PyFlink
介绍
环境准备
python -m pip install apache-flink
安装时需要下载很多其他依赖,网络环境好的话大约需要 2 小时。
入门案例
from pyflink.common.serialization import SimpleStringEncoder
from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors import StreamingFileSink
def tutorial():
    """Run a minimal PyFlink DataStream job.

    Builds a stream from a two-element in-memory collection, attaches a
    row-format ``StreamingFileSink`` that writes to ``./tmp/output`` using
    ``SimpleStringEncoder``, and executes the job as ``tutorial_job``.
    """
    # Environment: use a parallelism of 1.
    stream_env = StreamExecutionEnvironment.get_execution_environment()
    stream_env.set_parallelism(1)

    # Source: a fixed collection typed as ROW(INT, STRING).
    rows = [(1, 'aaa'), (2, 'bbb')]
    row_type = Types.ROW([Types.INT(), Types.STRING()])
    stream = stream_env.from_collection(collection=rows, type_info=row_type)

    # Sink: row-format file sink under ./tmp/output.
    sink_builder = StreamingFileSink.for_row_format(
        './tmp/output', SimpleStringEncoder())
    stream.add_sink(sink_builder.build())

    # Execute the job.
    stream_env.execute("tutorial_job")
# Run the DataStream example only when this file is executed as a script.
if __name__ == '__main__':
    tutorial()
from pyflink.dataset import ExecutionEnvironment
from pyflink.table import TableConfig, DataTypes, BatchTableEnvironment
from pyflink.table.descriptors import Schema, OldCsv, FileSystem
from pyflink.table.expressions import lit
# Word-count example using the PyFlink Table API: reads words from the
# filesystem table `mySource`, counts rows per word, and inserts the result
# into the filesystem table `mySink`.

# Environment: batch execution with a parallelism of 1.
exec_env = ExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(1)
t_config = TableConfig()
t_env = BatchTableEnvironment.create(exec_env, t_config)

# Alternative table registration via the descriptor API, kept for reference.
# The DDL statements below are what this script actually uses.
#
# # Declare the source
# t_env.connect(FileSystem().path('/tmp/input')) \
#     .with_format(OldCsv()
#                  .field('word', DataTypes.STRING())) \
#     .with_schema(Schema()
#                  .field('word', DataTypes.STRING())) \
#     .create_temporary_table('mySource')
#
# # Declare the sink
# t_env.connect(FileSystem().path('/tmp/output')) \
#     .with_format(OldCsv()
#                  .field_delimiter('\t')
#                  .field('word', DataTypes.STRING())
#                  .field('count', DataTypes.BIGINT())) \
#     .with_schema(Schema()
#                  .field('word', DataTypes.STRING())
#                  .field('count', DataTypes.BIGINT())) \
#     .create_temporary_table('mySink')

# DDL for the source table: a single `word` column read as CSV from /tmp/input.
my_source_ddl = """
create table mySource (
word VARCHAR
) with (
'connector' = 'filesystem',
'format' = 'csv',
'path' = '/tmp/input'
)
"""

# DDL for the sink table: (`word`, `count`) written as CSV to /tmp/output.
my_sink_ddl = """
create table mySink (
word VARCHAR,
`count` BIGINT
) with (
'connector' = 'filesystem',
'format' = 'csv',
'path' = '/tmp/output'
)
"""

# Register both tables. execute_sql() is used instead of sql_update(),
# which is deprecated since Flink 1.11.
t_env.execute_sql(my_source_ddl)
t_env.execute_sql(my_sink_ddl)

# Source
tab = t_env.from_path('mySource')

# Transformation + sink: group by word, count the rows of each group,
# insert into mySink, and wait for the insert job to finish.
tab.group_by(tab.word) \
    .select(tab.word, lit(1).count) \
    .execute_insert('mySink').wait()