Official documentation: https://ci.apache.org/projects/flink/flink-docs-release-1.11/dev/table/python/python_udfs.html
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment, DataTypes
from pyflink.table.descriptors import Schema, OldCsv, FileSystem, Kafka, Json
from pyflink.table.udf import udf, TableFunction,ScalarFunction,udtf
# Set up a streaming job with a single parallel task and create a Table API
# environment on top of it.
env = StreamExecutionEnvironment.get_execution_environment()
env.set_parallelism(1)
t_env = StreamTableEnvironment.create(env)
# Let Flink manage the memory of the Python UDF worker process
# (Flink 1.11 Python configuration option).
t_env.get_config().get_configuration().set_string("python.fn-execution.memory.managed", 'true')
#1
class SplitStr(TableFunction):
    """Table function that splits a '|'-delimited string into two BIGINTs.

    Each input string produces TWO identical output rows — this demonstrates
    that a Python table function may emit multiple rows per input.
    """

    def eval(self, str_value):
        parts = str_value.split('|')
        pair = (int(parts[0]), int(parts[1]))
        # Emit the same pair twice on purpose (multi-row output demo).
        yield pair
        yield pair
# Wrap the table function: one STRING input column, two BIGINT output columns.
splitStr = udtf(SplitStr(), DataTypes.STRING(), [DataTypes.BIGINT(), DataTypes.BIGINT()])
t_env.register_function("splitStr", splitStr)
# Like Python scalar functions, you can use the above five ways to define Python TableFunctions.
# The only difference is that the return type of Python Table Functions needs to be an iterable, iterator or generator.
#2
# @udtf(input_types=[DataTypes.STRING()], result_types=[DataTypes.BIGINT(), DataTypes.BIGINT()])
# def splitStr(str_value):
# #return type needs to be an iterable, iterator or generator
# # yield 1
# # yield 2
# # or
# # result = [1, 2, 3]
# # or
# result = [1,1]
# return result
# NOTE(review): this second registration is redundant — "splitStr" was already
# registered above; it simply re-registers the same object under the same name.
t_env.register_function("splitStr", splitStr)
# Source table: JSON records with two varchar fields, read from a Kafka topic.
t_env.sql_update("""
CREATE TABLE mySource (
a varchar,
b varchar
) WITH (
'connector' = 'kafka',
'topic' = 'mytesttopic',
'properties.bootstrap.servers' = '172.17.0.2:9092',
'properties.group.id' = 'flink-test-cxy',
'scan.startup.mode' = 'latest-offset',
'format' = 'json'
)
""")
# Sink table: prints every row to stdout via the 'print' connector.
t_env.sql_update("""
CREATE TABLE mySink (
a bigint,
b bigint
) WITH (
'connector' = 'print'
)
""")
# Cross-join each source row with the rows produced by the table function:
# LATERAL TABLE(splitStr(a)) expands column `a` into two BIGINT columns a1, a2.
t_env.sql_update("insert into mySink select a1,a2 from mySource,LATERAL TABLE(splitStr(a)) as T(a1, a2)")
# Submit the job (blocking call in Flink 1.11's old Table API).
t_env.execute("job")
Example 2: receiving JSON-format data (complex/nested types) from Kafka
from sklearn import linear_model
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment, DataTypes,types
from pyflink.table.descriptors import Schema, OldCsv, FileSystem, Kafka, Json
from pyflink.table.udf import udf, TableFunction, ScalarFunction, udtf
# Second example: same single-parallelism streaming setup as above.
env = StreamExecutionEnvironment.get_execution_environment()
env.set_parallelism(1)
t_env = StreamTableEnvironment.create(env)
# Let Flink manage the memory of the Python UDF worker process.
t_env.get_config().get_configuration().set_string("python.fn-execution.memory.managed", 'true')
class SplitStr(TableFunction):
    """Table function that expands an array of rows into (field0, field1) pairs.

    Emits one output row per element of the input array, taking the first
    two fields of each element.
    """

    def eval(self, data):
        for element in data:
            first, second = element[0], element[1]
            yield first, second
# NOTE: this declaration is actually not correct — the declared input type
# (STRING) does not match the real input column (an array<row<...>>),
# yet the job still runs.
# I raised a question about this on the Chinese-language Flink user mailing list:
# http://apache-flink.147419.n8.nabble.com/flinksql-udtf-ROW-td7660.html
splitStr = udtf(SplitStr(), DataTypes.STRING(), [DataTypes.BIGINT(), DataTypes.BIGINT()])
t_env.register_function("splitStr", splitStr)
# Source table: JSON records with a nested array of rows
# (e.g. {"id": "...", "data": [{"name": "...", "age": "..."}, ...]}).
t_env.sql_update("""
CREATE TABLE mySource (
id varchar,
data array<ROW<name STRING,age STRING>>
) WITH (
'connector' = 'kafka',
'topic' = 'mytesttopic',
'properties.bootstrap.servers' = '172.17.0.2:9092',
'properties.group.id' = 'flink-test-cxy',
'scan.startup.mode' = 'latest-offset',
'format' = 'json'
)
""")
# Sink table: despite the name, it uses the 'print' connector (stdout), not MySQL.
t_env.sql_update("""
CREATE TABLE mysqlsink (
id varchar
,name varchar
,age varchar
)
with (
'connector' = 'print'
)
""")
# Flatten the nested array: each element of `data` becomes one output row
# joined with the parent record's id.
t_env.sql_update("insert into mysqlsink select id,name,age from mySource ,LATERAL TABLE(splitStr(data)) as T(name, age)")
# Submit the job.
t_env.execute("qwe")