# Characteristics of a pandas UDF:
# each argument arrives as a pandas.Series and the result is also a
# pandas.Series with the same number of rows as the input batch.
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment, DataTypes, ScalarFunction
from pyflink.table.descriptors import Schema, OldCsv, FileSystem, Kafka, Json
from pyflink.table.udf import udf
import pandas as pd
import numpy as np
# Build the streaming environment and its table-API wrapper.
env = StreamExecutionEnvironment.get_execution_environment()
env.set_parallelism(1)
t_env = StreamTableEnvironment.create(env)

# Tune the Python worker: let Flink manage its memory, and use a tiny
# Arrow batch size (2 rows) so the pandas UDF batching is easy to observe.
configuration = t_env.get_config().get_configuration()
configuration.set_string("python.fn-execution.memory.managed", 'true')
configuration.set_string("python.fn-execution.arrow.batch.size", '2')
# Option 1: class-based pandas scalar UDF.
class Add(ScalarFunction):
    """Vectorized (pandas) scalar UDF computing an element-wise BIGINT sum.

    Because the function is registered with ``udf_type="pandas"``, ``eval``
    receives each argument as a ``pandas.Series`` (one Arrow batch per call)
    and must return a ``Series`` of the same length.
    """

    # NOTE: the original paste had this body flattened to column 0, which is
    # a SyntaxError in Python; the logic is unchanged, only re-indented.
    def eval(self, i, j):
        # Assemble a DataFrame so the sum is a single vectorized operation.
        # (A bare ``i + j`` would be equivalent; the DataFrame mirrors the
        # original code.)
        df = pd.DataFrame({'i': i, 'j': j})
        df["res"] = df["i"] + df["j"]
        return df['res']
# Wrap the class as a pandas UDF (BIGINT, BIGINT) -> BIGINT and register it
# under the SQL name "add" so it can be called from SQL below.
add = udf(Add(), [DataTypes.BIGINT(), DataTypes.BIGINT()], DataTypes.BIGINT(), udf_type="pandas")
t_env.register_function("add", add)
# Option 2 (kept for reference): an equivalent lambda-based pandas UDF.
# add = udf(lambda i, j: i + j, [DataTypes.BIGINT(), DataTypes.BIGINT()], DataTypes.BIGINT(), udf_type="pandas")
# t_env.register_function("add", add)
# Source table: JSON records (a BIGINT, b BIGINT) consumed from the Kafka
# topic 'mytesttopic', starting from the latest offset.
t_env.sql_update("""
CREATE TABLE mySource (
a bigint,
b bigint
) WITH (
'connector' = 'kafka',
'topic' = 'mytesttopic',
'properties.bootstrap.servers' = '172.17.0.2:9092',
'properties.group.id' = 'flink-test-cxy',
'scan.startup.mode' = 'latest-offset',
'format' = 'json'
)
""")
# Sink table: the 'print' connector writes each row to the task's stdout,
# which is handy for smoke-testing the pipeline.
t_env.sql_update("""
CREATE TABLE mySink (
a bigint,
b bigint
) WITH (
'connector' = 'print'
)
""")
# Wire the pipeline: apply the pandas UDF per row and write to the sink.
# NOTE(review): sql_update/execute are deprecated in newer PyFlink in favor
# of execute_sql — confirm against the Flink version in use before changing.
t_env.sql_update("insert into mySink select a, add(a,b) from mySource")
# Submit the streaming job under the name "job".
t_env.execute("job")