Official documentation: https://ci.apache.org/projects/flink/flink-docs-release-1.11/dev/table/python/python_udfs.html
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment, DataTypes
from pyflink.table.descriptors import Schema, OldCsv, FileSystem, Kafka, Json
from pyflink.table.udf import udf, TableFunction,ScalarFunction,udtf
# Set up a streaming job with a single parallel task and create a Table API
# environment on top of it.
env = StreamExecutionEnvironment.get_execution_environment()
env.set_parallelism(1)
t_env = StreamTableEnvironment.create(env)
# Let Flink manage the memory of the Python UDF worker process
# (Flink 1.11 Python configuration option).
t_env.get_config().get_configuration().set_string("python.fn-execution.memory.managed", 'true')
#1
class SplitStr(TableFunction):
    """Table function that splits a '|'-delimited string into two BIGINTs.

    Each input string produces TWO identical output rows — this demonstrates
    that a Python table function may emit multiple rows per input.
    """

    def eval(self, str_value):
        parts = str_value.split('|')
        pair = (int(parts[0]), int(parts[1]))
        # Emit the same pair twice on purpose (multi-row output demo).
        yield pair
        yield pair
# Wrap the table function: one STRING input column, two BIGINT output columns.
splitStr = udtf(SplitStr(), DataTypes.STRING(), [DataTypes.BIGINT(), DataTypes.BIGINT()])
t_env.register_function("splitStr", splitStr)
# Like Python scalar functions, you can use the above five ways to define Python TableFunctions.
# The only difference is that the return type of Python Table Functions needs to be an iterable, iterator or generator.
#2
# @udtf(input_types=[DataTypes.STRING()], result_types=[DataTypes.BIGINT(), DataTypes.BIGINT()])
# def splitStr(str_value):
# #return type needs to be an iterable, iterator or generator
# # yield 1
# # yield 2
# # or
# # result = [1, 2, 3]
# # or
# result = [1,1]
# return result
# NOTE(review): this second registration is redundant — "splitStr" was already
# registered above; it simply re-registers the same object under the same name.
t_env.register_function("splitStr", splitStr)
# Source table: JSON records with two varchar fields, read from a Kafka topic.
t_env.sql_update("""
CREATE TABLE mySource (
a varchar,
b varchar
) WITH (
'connector' = 'kafka',
'topic' = 'mytesttopic',
'properties.bootstrap.servers' = '172.17.0.2:9092',
'properties.group.id' = 'flink-test-cxy',
'scan.startup.mode' = 'latest-offset',
'format' = 'json'
)
""")
# Sink table: prints every row to stdout via the 'print' connector.
t_env.sql_update("""
CREATE TABLE mySink (
a bigint,
b bigint
) WITH (
'connector' = 'print'
)
""")
# Cross-join each source row with the rows produced by the table function:
# LATERAL TABLE(splitStr(a)) expands column `a` into two BIGINT columns a1, a2.
t_env.sql_update("insert into mySink select a1,a2 from mySource,LATERAL TABLE(splitStr(a)) as T(a1, a2)")
# Submit the job (blocking call in Flink 1.11's old Table API).
t_env.execute("job")
Example 2: receiving JSON-format data (complex/nested types) from Kafka
from sklearn import linear_model
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment, DataTypes,types
from pyflink.table.descriptors import Schema, OldCsv, FileSystem, Kafka, Json
from pyflink.table.udf import udf, TableFunction, ScalarFunction, udtf
# Second example: same single-parallelism streaming setup as above.
env = StreamExecutionEnvironment.get_execution_environment()
env.set_parallelism(1)
t_env = StreamTableEnvironment.create(env)
# Let Flink manage the memory of the Python UDF worker process.
t_env.get_config().get_configuration().set_string("python.fn-execution.memory.managed", 'true')
class SplitStr(TableFunction):
    """Table function that expands an array of rows into (field0, field1) pairs.

    Emits one output row per element of the input array, taking the first
    two fields of each element.
    """

    def eval(self, data):
        for element in data:
            first, second = element[0], element[1]
            yield first, second
# NOTE: this declaration is actually not correct — the declared input type
# (STRING) does not match the real input column (an array<row<...>>),
# yet the job still runs.
# I raised a question about this on the Chinese-language Flink user mailing list:
# http://apache-flink.147419.n8.nabble.com/flinksql-udtf-ROW-td7660.html
splitStr = udtf(SplitStr(), DataTypes.STRING(), [DataTypes.BIGINT(), DataTypes.BIGINT()])
t_env.register_function("splitStr", splitStr)
# Source table: JSON records with a nested array of rows
# (e.g. {"id": "...", "data": [{"name": "...", "age": "..."}, ...]}).
t_env.sql_update("""
CREATE TABLE mySource (
id varchar,
data array<ROW<name STRING,age STRING>>
) WITH (
'connector' = 'kafka',
'topic' = 'mytesttopic',
'properties.bootstrap.servers' = '172.17.0.2:9092',
'properties.group.id' = 'flink-test-cxy',
'scan.startup.mode' = 'latest-offset',
'format' = 'json'
)
""")
# Sink table: despite the name, it uses the 'print' connector (stdout), not MySQL.
t_env.sql_update("""
CREATE TABLE mysqlsink (
id varchar
,name varchar
,age varchar
)
with (
'connector' = 'print'
)
""")
# Flatten the nested array: each element of `data` becomes one output row
# joined with the parent record's id.
t_env.sql_update("insert into mysqlsink select id,name,age from mySource ,LATERAL TABLE(splitStr(data)) as T(name, age)")
# Submit the job.
t_env.execute("qwe")