1.11.0 pyflink Vectorized udf自定义向量函数

特点

Vectorized Python scalar functions take pandas.Series as the inputs and return a pandas.Series of the same length as the output.

参数是 pandas.Series 类型，输出也是 pandas.Series，且输入行数和输出行数相同。

from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment, DataTypes, ScalarFunction
from pyflink.table.descriptors import Schema, OldCsv, FileSystem, Kafka, Json
from pyflink.table.udf import udf
import pandas as pd
import numpy as np
# Build a streaming execution environment and its Table API wrapper.
env = StreamExecutionEnvironment.get_execution_environment()
env.set_parallelism(1)
t_env = StreamTableEnvironment.create(env)
# Let Flink manage the Python worker's memory — required for pandas
# (vectorized) UDFs in Flink 1.11.
t_env.get_config().get_configuration().set_string("python.fn-execution.memory.managed", 'true')
# Max rows per Arrow batch handed to the pandas UDF; 2 is a tiny demo value
# so batching behavior is easy to observe.
t_env.get_config().get_configuration().set_string("python.fn-execution.arrow.batch.size", '2')
# Variant 1: class-based vectorized scalar function.
class Add(ScalarFunction):
    """Element-wise sum of two BIGINT columns.

    Registered with udf_type="pandas", so ``i`` and ``j`` arrive as
    pandas.Series of equal length and the returned Series must have the
    same length as the inputs.
    """

    def eval(self, i, j):
        # Gather both input columns into a single frame, add them
        # column-wise, and hand the result column back.
        frame = pd.DataFrame({'i': i, 'j': j})
        frame['res'] = frame['i'] + frame['j']
        return frame['res']
# Wrap the class-based function as a vectorized UDF: two BIGINT inputs, one
# BIGINT output.  udf_type="pandas" makes the arguments arrive as
# pandas.Series instead of one scalar per call.
add = udf(Add(), [DataTypes.BIGINT(), DataTypes.BIGINT()], DataTypes.BIGINT(), udf_type="pandas")
t_env.register_function("add", add)

# Variant 2: the same UDF as a lambda — Series + Series is element-wise in
# pandas, so `i + j` already yields a Series of the required length.
# add = udf(lambda i, j: i + j, [DataTypes.BIGINT(), DataTypes.BIGINT()], DataTypes.BIGINT(), udf_type="pandas")
# t_env.register_function("add", add)

# Source table: JSON records {a, b} read from a Kafka topic, starting at the
# latest offset.  NOTE(review): sql_update is deprecated in Flink 1.11 —
# execute_sql is the recommended replacement for DDL; confirm before porting.
t_env.sql_update("""
	CREATE TABLE mySource (                                       
		a bigint,                                                    
		b bigint                                                
	) WITH (                                                         
	'connector' = 'kafka',
	'topic' = 'mytesttopic',
	'properties.bootstrap.servers' = '172.17.0.2:9092',
	'properties.group.id' = 'flink-test-cxy',
	'scan.startup.mode' = 'latest-offset',
	'format' = 'json'                                      
	) 
""")
# Sink table: the 'print' connector writes every row to stdout, which makes
# the UDF output directly visible for this demo.
t_env.sql_update("""
	CREATE TABLE mySink (                                       
		a bigint,                                                    
		b bigint                                                
	) WITH (                                                         
	'connector' = 'print'       
	) 
""")
# Pipeline: each incoming row (a, b) is emitted as (a, add(a, b)).
t_env.sql_update("insert into mySink select a, add(a,b) from mySource")
# Submit the streaming job; blocks until the job is cancelled.
t_env.execute("job")

 

©️2020 CSDN 皮肤主题: 大白 设计师:CSDN官方博客 返回首页