// from_avro / to_avro live in the external spark-avro module
// (org.apache.spark:spark-avro_2.12), which must be on the classpath.
// Note: there is no io.confluent.kafka.serializers.AvroSerializer; the
// Confluent class that does the wire-format framing is KafkaAvroSerializer.
import io.confluent.kafka.serializers.{AbstractKafkaAvroSerDeConfig, KafkaAvroSerializer}
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import org.apache.kafka.common.serialization.StringSerializer
import org.apache.spark.sql.{ForeachWriter, Row}
import org.apache.spark.sql.avro.functions.{from_avro, to_avro}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import scala.collection.JavaConverters._
val schemaRegistryUrl = "http://localhost:8081"
// Reading from and writing to the same topic would make the job consume its
// own output, so source and sink are kept separate (the sink name is illustrative)
val sourceTopic = "my_topic"
val sinkTopic = "my_topic_confluent"
// Define the Avro schema for the data
val schema = new Schema.Parser().parse(
"""
|{
| "type": "record",
| "name": "my_record",
| "fields": [
| {"name": "name", "type": "string"},
| {"name": "age", "type": "int"}
| ]
|}
""".stripMargin)
// Producer configuration: KafkaAvroSerializer registers the schema with the
// Schema Registry and prepends the Confluent wire-format header (magic byte
// 0x0 + 4-byte schema ID) to every value; keys here are plain strings
val kafkaParams = Map[String, Object](
  "bootstrap.servers" -> "localhost:9092",
  "key.serializer" -> classOf[StringSerializer],
  "value.serializer" -> classOf[KafkaAvroSerializer],
  AbstractKafkaAvroSerDeConfig.SCHEMA_REGISTRY_URL_CONFIG -> schemaRegistryUrl
)
// Spark-side equivalent of the Avro schema, shown for reference only;
// from_avro derives the struct type from the Avro schema itself
val dataSchema = new StructType()
  .add(StructField("name", StringType))
  .add(StructField("age", IntegerType))
// Read the streaming data from the source topic. The values are assumed to be
// plain Apache Avro (no Confluent header), which is what from_avro expects.
// Note that from_avro takes the schema as a JSON string, not a Schema object.
val df = spark.readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "localhost:9092")
  .option("subscribe", sourceTopic)
  .option("startingOffsets", "earliest")
  .load()
  .select(from_avro(col("value"), schema.toString).as("data"))
  .select("data.*")
// to_avro emits plain Apache Avro bytes; it does NOT add the Confluent
// wire-format header, so writing avroDf straight to the kafka sink would be
// unreadable for schema-registry-aware consumers. The foreach sink further
// below re-serializes each row with KafkaAvroSerializer instead; the sketch
// right after this shows the other option of adding the header by hand and
// keeping the built-in kafka sink.
val avroDf = df.select(to_avro(struct("*")).as("value"))
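// A minimal sketch of the header-prepend route, assuming the schema is
// registered once up front under the "<topic>-value" subject naming strategy;
// the register(String, Schema) call matches older CachedSchemaRegistryClient
// releases (newer ones take a ParsedSchema instead).
import io.confluent.kafka.schemaregistry.client.CachedSchemaRegistryClient
import java.nio.ByteBuffer

val registryClient = new CachedSchemaRegistryClient(schemaRegistryUrl, 100)
val schemaId = registryClient.register(s"$sinkTopic-value", schema)
val header = ByteBuffer.allocate(5).put(0: Byte).putInt(schemaId).array()
// concat works on binary columns, so the framing is a pure Spark expression
val confluentDf = avroDf.select(concat(lit(header), col("value")).as("value"))
// confluentDf could then use the built-in sink directly:
//   confluentDf.writeStream.format("kafka")
//     .option("kafka.bootstrap.servers", "localhost:9092")
//     .option("topic", sinkTopic) ...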
// KafkaAvroSerializer serializes Avro objects, not pre-encoded byte arrays,
// so each Row is rebuilt as a GenericRecord; the serializer then registers
// the schema and adds the Confluent framing on every send
def rowToRecord(row: Row): GenericRecord = {
  val record = new GenericData.Record(schema)
  record.put("name", row.getAs[String]("name"))
  record.put("age", row.getAs[Int]("age"))
  record
}
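// Optional sanity check on the driver, a sketch that exercises the real
// serializer directly; the first byte of the output should be the Confluent
// magic byte 0x0 (requires a running Schema Registry)
val probeSerializer = new KafkaAvroSerializer()
probeSerializer.configure(
  Map[String, Any](
    AbstractKafkaAvroSerDeConfig.SCHEMA_REGISTRY_URL_CONFIG -> schemaRegistryUrl
  ).asJava,
  false) // false = configure as a value serializer
val probe = new GenericData.Record(schema)
probe.put("name", "alice")
probe.put("age", 30)
val framedBytes = probeSerializer.serialize(sinkTopic, probe)
assert(framedBytes(0) == 0x0.toByte)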
// Write the records to Kafka as Confluent Avro. The built-in format("kafka")
// sink and foreach(...) are mutually exclusive, so this uses a ForeachWriter
// that owns one producer per partition and reuses it across rows, rather
// than opening and closing a producer for every single message
val query: StreamingQuery = df.writeStream
  .outputMode(OutputMode.Append())
  .option("checkpointLocation", "/tmp/checkpoints")
  .foreach(new ForeachWriter[Row] {
    var producer: KafkaProducer[String, GenericRecord] = _
    override def open(partitionId: Long, epochId: Long): Boolean = {
      producer = new KafkaProducer[String, GenericRecord](kafkaParams.asJava)
      true
    }
    override def process(row: Row): Unit = {
      producer.send(new ProducerRecord[String, GenericRecord](sinkTopic, rowToRecord(row)))
    }
    override def close(errorOrNull: Throwable): Unit = {
      if (producer != null) producer.close()
    }
  })
  .start()
query.awaitTermination()
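// To verify the result, Confluent's console consumer can decode the framed
// records (topic name assumed from the setup above):
//   kafka-avro-console-consumer --bootstrap-server localhost:9092 \
//     --topic my_topic_confluent --from-beginning \
//     --property schema.registry.url=http://localhost:8081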