Apache Avro to Confluent Avro

The Spark Structured Streaming job below reads records serialized as plain Apache Avro from a Kafka topic and republishes them in Confluent's Schema-Registry-aware wire format.

import scala.jdk.CollectionConverters._ // on Scala 2.12, use scala.collection.JavaConverters._

import io.confluent.kafka.serializers.{AbstractKafkaAvroSerDeConfig, KafkaAvroSerializer}
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericRecord, GenericRecordBuilder}
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import org.apache.spark.sql.avro.functions.from_avro
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{ForeachWriter, Row}

val schemaRegistryUrl = "http://localhost:8081"
val topic = "my_topic"

// Define the Avro schema for the data
val schema = new Schema.Parser().parse(
  """
    |{
    |   "type": "record",
    |   "name": "my_record",
    |   "fields": [
    |      {"name": "name", "type": "string"},
    |      {"name": "age", "type": "int"}
    |   ]
    |}
  """.stripMargin)

// Kafka producer configuration: KafkaAvroSerializer emits the Confluent wire format and registers schemas with the Schema Registry
val kafkaParams = Map[String, Object](
  "bootstrap.servers" -> "localhost:9092",
  "key.serializer" -> classOf[KafkaAvroSerializer],
  "value.serializer" -> classOf[KafkaAvroSerializer],
  AbstractKafkaAvroSerDeConfig.SCHEMA_REGISTRY_URL_CONFIG -> schemaRegistryUrl
)
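
As a quick sanity check before starting the streaming job, the same configuration can drive a one-off producer; "alice" and 30 are made-up sample values:

// Hypothetical smoke test: send one record synchronously
val testProducer = new KafkaProducer[String, GenericRecord](kafkaParams.asJava)
val testRecord = new GenericRecordBuilder(schema)
  .set("name", "alice")
  .set("age", 30)
  .build()
testProducer.send(new ProducerRecord[String, GenericRecord](topic, testRecord)).get()
testProducer.close()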

// The equivalent Spark SQL schema, shown for reference; from_avro derives the columns from the Avro schema above
val dataSchema = new StructType()
  .add(StructField("name", StringType))
  .add(StructField("age", IntegerType))

// Read the streaming data from a source, such as Kafka
val df = spark.readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "localhost:9092")
  .option("subscribe", topic)
  .option("startingOffsets", "earliest")
  .load()
  .select(from_avro(col("value"), schema.toString).as("data")) // from_avro expects the schema as a JSON string
  .select("data.*")

// Convert each row back to an Avro GenericRecord. The producer's
// KafkaAvroSerializer then emits the Confluent wire format (magic byte
// 0x00, 4-byte big-endian schema ID, Avro binary) and registers the
// schema with the Schema Registry on first use.
def toAvroRecord(row: Row): GenericRecord =
  new GenericRecordBuilder(schema)
    .set("name", row.getAs[String]("name"))
    .set("age", row.getAs[Int]("age"))
    .build()

// Write the records to Kafka. foreach is itself a sink, so it is not
// combined with format("kafka"); the writer opens one producer per
// partition rather than one per row.
val query: StreamingQuery = df.writeStream
  .outputMode(OutputMode.Append())
  .option("checkpointLocation", "/tmp/checkpoints")
  .foreach(new ForeachWriter[Row] {
    private var producer: KafkaProducer[String, GenericRecord] = _

    override def open(partitionId: Long, epochId: Long): Boolean = {
      producer = new KafkaProducer[String, GenericRecord](kafkaParams.asJava)
      true
    }

    override def process(row: Row): Unit =
      producer.send(new ProducerRecord[String, GenericRecord](topic, toAvroRecord(row)))

    override def close(errorOrNull: Throwable): Unit =
      if (producer != null) producer.close()
  })
  .start()

query.awaitTermination()
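
An alternative that keeps Spark's built-in Kafka sink is to stay with to_avro and prepend the 5-byte Confluent header yourself: byte 0 is the magic byte 0x00 and bytes 1-4 hold the schema ID as a big-endian int, followed by the Avro payload. A minimal sketch, assuming Spark 3.x (for the two-argument to_avro), the schemaId captured in the registration sketch above, and decoded columns compatible with the registered schema (nullable columns may require a union schema):

import java.nio.ByteBuffer
import org.apache.spark.sql.avro.functions.to_avro

// Wrap plain Avro binary in the Confluent wire format:
// [0x00 magic byte][4-byte big-endian schema ID][Avro payload]
val toWireFormat = udf { (payload: Array[Byte]) =>
  ByteBuffer.allocate(5 + payload.length)
    .put(0.toByte)
    .putInt(schemaId)
    .put(payload)
    .array()
}

val confluentDf = df
  .select(to_avro(struct(col("name"), col("age")), schema.toString).as("avro"))
  .select(toWireFormat(col("avro")).as("value"))

val altQuery = confluentDf.writeStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "localhost:9092")
  .option("topic", topic)
  .option("checkpointLocation", "/tmp/checkpoints-alt")
  .start()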
