1.11.0 PyFlink UDTF (user-defined table function)

Official docs: https://ci.apache.org/projects/flink/flink-docs-release-1.11/dev/table/python/python_udfs.html

from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment, DataTypes
from pyflink.table.udf import udtf, TableFunction

env = StreamExecutionEnvironment.get_execution_environment()
env.set_parallelism(1)
t_env = StreamTableEnvironment.create(env)
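# let the Python UDF worker use Flink managed memory (in 1.11 this avoids configuring extra process memory)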
t_env.get_config().get_configuration().set_string("python.fn-execution.memory.managed", 'true')
# Approach 1: subclass TableFunction and implement eval()
class SplitStr(TableFunction):
    def eval(self, str_value):
        str_arr = str_value.split('|')
        # each yield produces one output row; yielding twice emits two rows per input
        yield int(str_arr[0]), int(str_arr[1])
        yield int(str_arr[0]), int(str_arr[1])

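# wrap the TableFunction and declare its signature: udtf(function, input_types, result_types)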
splitStr = udtf(SplitStr(), DataTypes.STRING(), [DataTypes.BIGINT(), DataTypes.BIGINT()])
t_env.register_function("splitStr", splitStr)

# Like Python scalar functions, Python table functions can be defined in any of the
# five ways shown in the docs; the only difference is that the return value of a
# Python table function must be an iterable, an iterator or a generator.
# Approach 2: define the same UDTF with the @udtf decorator (kept commented out)
# @udtf(input_types=[DataTypes.STRING()], result_types=[DataTypes.BIGINT(), DataTypes.BIGINT()])
# def splitStr(str_value):
# 	# the return value must be an iterable, iterator or generator;
# 	# with two result columns, each element is one output row (a tuple)
# 	str_arr = str_value.split('|')
# 	return [(int(str_arr[0]), int(str_arr[1]))]
	

t_env.sql_update("""
	CREATE TABLE mySource (
		a varchar,
		b varchar
	) WITH (
		'connector' = 'kafka',
		'topic' = 'mytesttopic',
		'properties.bootstrap.servers' = '172.17.0.2:9092',
		'properties.group.id' = 'flink-test-cxy',
		'scan.startup.mode' = 'latest-offset',
		'format' = 'json'
	)
""")
t_env.sql_update("""
	CREATE TABLE mySink (
		a bigint,
		b bigint
	) WITH (
		'connector' = 'print'
	)
""")
t_env.sql_update("insert into mySink select a1, a2 from mySource, LATERAL TABLE(splitStr(a)) as T(a1, a2)")
t_env.execute("job")
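
For a quick smoke test, push a record into the source topic and watch the print sink. A minimal sketch, assuming the broker and topic from the DDL above and a made-up payload:

# bin/kafka-console-producer.sh --broker-list 172.17.0.2:9092 --topic mytesttopic
# > {"a": "1|2", "b": "x"}
# the print sink should then emit the row (1,2) twice, since eval() yields twice per input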

Receiving JSON data from Kafka (complex types)


from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment, DataTypes
from pyflink.table.udf import udtf, TableFunction

env = StreamExecutionEnvironment.get_execution_environment()
env.set_parallelism(1)
t_env = StreamTableEnvironment.create(env)
t_env.get_config().get_configuration().set_string("python.fn-execution.memory.managed", 'true')

class SplitStr(TableFunction):
    def eval(self, data):
        for row in data:
            yield row[0], row[1]

# Note: this is actually not right - the declared input type (STRING) does not
# match the actual input (an array of rows), yet the job still runs.
# I raised this question on the Chinese-language user mailing list:
# http://apache-flink.147419.n8.nabble.com/flinksql-udtf-ROW-td7660.html
splitStr = udtf(SplitStr(), DataTypes.STRING(), [DataTypes.BIGINT(), DataTypes.BIGINT()])
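# A presumably more type-accurate declaration would spell out the array-of-row
# input and string outputs; this is a hedged sketch, not verified against the
# mailing-list thread above:
# splitStr = udtf(SplitStr(),
#                 DataTypes.ARRAY(DataTypes.ROW([DataTypes.FIELD("name", DataTypes.STRING()),
#                                                DataTypes.FIELD("age", DataTypes.STRING())])),
#                 [DataTypes.STRING(), DataTypes.STRING()])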


t_env.register_function("splitStr", splitStr)

t_env.sql_update("""
	CREATE TABLE mySource (
		id varchar,
		data array<ROW<name STRING, age STRING>>
	) WITH (
		'connector' = 'kafka',
		'topic' = 'mytesttopic',
		'properties.bootstrap.servers' = '172.17.0.2:9092',
		'properties.group.id' = 'flink-test-cxy',
		'scan.startup.mode' = 'latest-offset',
		'format' = 'json'
	)
""")
t_env.sql_update("""
	CREATE TABLE mysqlsink (
		id varchar,
		name varchar,
		age varchar
	) WITH (
		'connector' = 'print'
	)
""")
t_env.sql_update("insert into mysqlsink select id, name, age from mySource, LATERAL TABLE(splitStr(data)) as T(name, age)")

t_env.execute("qwe")
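
For reference, an input message like the following (hypothetical values) would be unnested by the UDTF into one row per element of data:

# {"id": "1", "data": [{"name": "tom", "age": "20"}, {"name": "jack", "age": "18"}]}
# the print sink should emit something like (1,tom,20) and (1,jack,18)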

 
