kafka2mysql

This post shows how to use Apache Flink's Python API (PyFlink) to read JSON data from a Kafka source and write it into a MySQL database. It walks through setting up the execution environment, defining the source and sink DDL, registering the source and sink tables, and submitting the Flink job. A data-generation script is also included to simulate the Kafka data stream.
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import EnvironmentSettings, StreamTableEnvironment, DataTypes


def kafka_to_mysql():
    """
    从Kafka Source读取Json数据,然后导入到Mysql。{"msg": "welcome flink users..."}
    """
    settings = EnvironmentSettings.new_instance().in_streaming_mode().use_blink_planner().build()
    env = StreamExecutionEnvironment.get_execution_environment()
    t_env = StreamTableEnvironment.create(stream_execution_environment=env, environment_settings=settings)
    t_env.get_config().get_configuration().set_boolean("python.fn-execution.memory.managed", True)

    """
    
        JARS_DIR=/Users/youxuehu/IdeaProjects/springboot-demo/know_how_know_why-master/khkw/No38_flink_python_env/myJars/
        wget -P ${JARS_DIR} https://repo.maven.apache.org/maven2/org/apache/flink/flink-json/1.11.1/flink-json-1.11.1.jar; \
        wget -P ${JARS_DIR} https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-kafka-0.11_2.11/1.11.1/flink-sql-connector-kafka-0.11_2.11-1.11.1.jar; \
        wget -P ${JARS_DIR} https://repo.maven.apache.org/maven2/org/apache/flink/flink-connector-jdbc_2.11/1.11.1/flink-connector-jdbc_2.11-1.11.1.jar; \
        wget -P ${JARS_DIR} https://repo.maven.apache.org/maven2/mysql/mysql-connector-java/5.1.40/mysql-connector-java-5.1.40.jar; \
        wget -P ${JARS_DIR} https://repo.maven.apache.org/maven2/org/apache/flink/flink-table-common/1.11.1/flink-table-common-1.11.1.jar ; \
    
    """

    base_dir = "file:///Users/youxuehu/IdeaProjects/springboot-demo/know_how_know_why-master/khkw/No38_flink_python_env/myJars/"

    kafka_jar = f"{base_dir}flink-sql-connector-kafka-0.11_2.11-1.11.1.jar"
    jdbc_jar = f"{base_dir}flink-connector-jdbc_2.11-1.11.1.jar"
    mysql_jar = f"{base_dir}mysql-connector-java-5.1.40.jar"
    json_format_jar = f"{base_dir}flink-json-1.11.1.jar"
    table_common_jar = f"{base_dir}flink-table-common-1.11.1.jar"

    jar_seq = [kafka_jar, jdbc_jar, mysql_jar, json_format_jar, table_common_jar]
    jars = ";".join(jar_seq)

    t_env.get_config().get_configuration().set_string("pipeline.jars", jars)

    source_ddl = """
                    CREATE TABLE kafka_source (
                        createTime STRING,
                        orderId STRING,
                        payAmount STRING,
                        payPlatform STRING,
                        provinceId STRING
                    ) WITH (
                        'connector' = 'kafka-0.11',
                        'topic' = 'cdn-log',
                        'properties.bootstrap.servers' = 'localhost:9092',
                        'format' = 'json',
                        'scan.startup.mode' = 'latest-offset'
                    )
                    """

    sink_ddl = """
                  CREATE TABLE mysql_sink (
                    createTime STRING,
                    orderId STRING,
                    payAmount STRING,
                    payPlatform STRING,
                    provinceId STRING
                ) WITH (
                   'connector' = 'jdbc',
                   'url' = 'jdbc:mysql://localhost:3306/flinkdb?characterEncoding=utf-8&useSSL=false',
                   'table-name' = 'cdn_log',
                   'username' = 'root',
                   'password' = '123456',
                   'sink.buffer-flush.max-rows' = '1'
                )
        """

    # Register the source and sink tables
    t_env.execute_sql(source_ddl)
    t_env.execute_sql(sink_ddl)

    # Read from the source table
    tab = t_env.from_path("kafka_source")
    # For now we stay on the deprecated insert_into/execute API, because the newer
    # asynchronous submission still needs more testing here (a sketch of the
    # statement-based replacement follows after this script).
    tab.insert_into("mysql_sink")
    # Submit the job
    t_env.execute("kafka_to_mysql")

if __name__ == '__main__':
    kafka_to_mysql()
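
As an alternative to the deprecated insert_into/execute pair, Flink 1.11 also offers statement-based submission via execute_sql. The following is a minimal sketch, assuming t_env, source_ddl and sink_ddl are the ones defined in the script above; note that table_result.wait() is only available in more recent PyFlink releases:

    # Sketch: statement-based job submission (Flink 1.11+)
    t_env.execute_sql(source_ddl)
    t_env.execute_sql(sink_ddl)
    table_result = t_env.execute_sql("INSERT INTO mysql_sink SELECT * FROM kafka_source")
    # execute_sql submits the INSERT job asynchronously; wait() blocks until it terminates
    table_result.wait()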

################################################################################
#  Licensed to the Apache Software Foundation (ASF) under one
#  or more contributor license agreements.  See the NOTICE file
#  distributed with this work for additional information
#  regarding copyright ownership.  The ASF licenses this file
#  to you under the Apache License, Version 2.0 (the
#  "License"); you may not use this file except in compliance
#  with the License.  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
# limitations under the License.
################################################################################

import random
import time, calendar
from random import randint
from kafka import KafkaProducer
from json import dumps
from time import sleep

'''
Setup and run instructions:
1. Install the Python dependencies locally:
   python3.6 -m pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
2. Copy the deployment files:
   cp -rf /Users/jincheng/work/knowhkw/khkw/No40-log-example-python/* ~/flinkDeploy/
3. Start the whole environment: docker-compose up -d
4. Open the Flink web UI: http://localhost:4000/#/overview
5. To verify that test data is being delivered, start a Kafka console consumer:
   cd /Users/jincheng/work/knowhkw/khkw/No40-log-example-python
   docker-compose exec kafka kafka-console-consumer.sh --bootstrap-server kafka:9092 --topic cdn-log
6. Send test data:
   cd /Users/jincheng/work/knowhkw/khkw/No40-log-example-python
   Install the dependencies inside the container:
   docker-compose exec jobmanager python -m pip install -r /opt/flinkDeploy/requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
   Start the generator:
   docker-compose exec jobmanager python /opt/flinkDeploy/data_gen.py
   Once data is flowing, stop the console consumer above and start the job.
7. Start the job:
   cd /Users/jincheng/work/knowhkw/khkw/No40-log-example-python
   docker-compose exec jobmanager flink run -py /opt/flinkDeploy/log_example.py
8. Start the data generator: docker-compose exec jobmanager python /opt/flinkDeploy/data_gen.py
9. Check the running job in the web UI.
'''


def write_data():
    data_cnt = 20000
    order_id = calendar.timegm(time.gmtime())
    max_price = 1000

    topic = "cdn-log"
    producer = KafkaProducer(bootstrap_servers=['localhost:9092'],
                             value_serializer=lambda x: dumps(x).encode('utf-8'))

    for i in range(data_cnt):
        ts = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        rd = random.random()
        order_id += 1
        pay_amount = max_price * rd
        pay_platform = 0 if random.random() < 0.9 else 1
        province_id = randint(0, 6)
        cur_data = {"createTime": ts, "orderId": order_id, "payAmount": pay_amount, "payPlatform": pay_platform, "provinceId": province_id}
        print(cur_data)
        producer.send(topic, value=cur_data)
        sleep(0.5)


if __name__ == '__main__':
    write_data()
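
For a quick local check that the generator is actually producing records, a small kafka-python consumer can be used instead of the console consumer shown below. A minimal sketch, assuming the broker is reachable on localhost:9092:

from json import loads
from kafka import KafkaConsumer

# Sketch: read a handful of records from the cdn-log topic to confirm
# that the generator above is producing data.
consumer = KafkaConsumer(
    "cdn-log",
    bootstrap_servers=["localhost:9092"],
    auto_offset_reset="earliest",
    consumer_timeout_ms=10000,
    value_deserializer=lambda m: loads(m.decode("utf-8")))

for i, message in enumerate(consumer):
    print(message.value)
    if i >= 4:  # stop after five records
        break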

The MySQL sink table (in database flinkdb) can be created with:

create table cdn_log(createTime varchar(255),orderId varchar(255),payAmount varchar(255),payPlatform varchar(255),provinceId varchar(255));

Messages on the cdn-log topic can also be produced and consumed manually for debugging:

docker-compose exec kafka kafka-console-producer.sh --broker-list localhost:9092 --topic cdn-log
docker-compose exec kafka kafka-console-consumer.sh --bootstrap-server kafka:9092 --topic cdn-log
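
Once the job is running, a quick way to confirm that rows are landing in MySQL is a small query script. A minimal sketch, assuming the pymysql driver is installed and using the credentials from the compose file below (root / 123456, database flinkdb):

import pymysql

# Sketch: count the rows written to the cdn_log sink table
connection = pymysql.connect(host="localhost", port=3306, user="root",
                             password="123456", database="flinkdb")
try:
    with connection.cursor() as cursor:
        cursor.execute("SELECT COUNT(*) FROM cdn_log")
        print("rows in cdn_log:", cursor.fetchone()[0])
finally:
    connection.close()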
The complete docker-compose.yml for the environment:

version: '2'
services:
   jobmanager:
     image: khkw/flink:1.11.1
     volumes:
       - ~/flinkDeploy:/opt/flinkDeploy
     hostname: "jobmanager"
     expose:
       - "6123"
     ports:
       - "4000:4000"
     command: jobmanager
     environment:
       - JOB_MANAGER_RPC_ADDRESS=jobmanager
   taskmanager:
     image: khkw/flink:1.11.1
     volumes:
       - ~/flinkDeploy:/opt/flinkDeploy
     expose:
       - "6121"
       - "6122"
     depends_on:
       - jobmanager
     command: taskmanager
     links:
       - jobmanager:jobmanager
     environment:
       - JOB_MANAGER_RPC_ADDRESS=jobmanager
   zookeeper:
    container_name: zookeeper
    image: zookeeper:3.6.1
    ports:
      - "2181:2181"
   kafka:
    container_name: kafka
    image: wurstmeister/kafka:2.12-2.5.0
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
    ports:
      - "9092:9092"
    depends_on:
      - zookeeper
    environment:
      HOSTNAME_COMMAND: "route -n | awk '/UG[ \t]/{print $$2}'"
      KAFKA_CREATE_TOPICS: "cdn-log:1:1"
      KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://127.0.0.1:9092
      KAFKA_LISTENERS: PLAINTEXT://0.0.0.0:9092
   mysql:
    image: mysql:5.7
    container_name: mysql
    volumes:
      - ~/mysql/data/db:/var/lib/mysql/
      - ~/mysql/mysql-3346.sock:/var/run/mysql.sock
      - ~/mysql/data/conf:/etc/mysql/conf.d
    ports:
      - 3306:3306
    command:
      --default-authentication-plugin=mysql_native_password
      --lower_case_table_names=1
    environment:
      MYSQL_ROOT_PASSWORD: 123456
      TZ: Asia/Shanghai
