// from_avro / to_avro live in the external spark-avro module
// (org.apache.spark:spark-avro_2.12), which must be on the classpath.
// Note: there is no io.confluent.kafka.serializers.AvroSerializer; the
// Confluent class that does the wire-format framing is KafkaAvroSerializer.
import io.confluent.kafka.serializers.{AbstractKafkaAvroSerDeConfig, KafkaAvroSerializer}
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import org.apache.kafka.common.serialization.StringSerializer
import org.apache.spark.sql.{ForeachWriter, Row}
import org.apache.spark.sql.avro.functions.{from_avro, to_avro}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import scala.collection.JavaConverters._
val schemaRegistryUrl = "http://localhost:8081"
// Reading from and writing to the same topic would make the job consume its
// own output, so source and sink are kept separate (the sink name is illustrative)
val sourceTopic = "my_topic"
val sinkTopic = "my_topic_confluent"
// Define the Avro schema for the data
val schema = new Schema.Parser().parse(
"""
|{
| "type": "record",
| "name": "my_record",
| "fields": [
| {"name": "name", "type": "string"},
| {"name": "age", "type": "int"}
| ]
|}
""".stripMargin)
// Producer configuration: KafkaAvroSerializer registers the schema with the
// Schema Registry and prepends the Confluent wire-format header (magic byte
// 0x0 + 4-byte schema ID) to every value; keys here are plain strings
val kafkaParams = Map[String, Object](
  "bootstrap.servers" -> "localhost:9092",
  "key.serializer" -> classOf[StringSerializer],
  "value.serializer" -> classOf[KafkaAvroSerializer],
  AbstractKafkaAvroSerDeConfig.SCHEMA_REGISTRY_URL_CONFIG -> schemaRegistryUrl
)
// Spark-side equivalent of the Avro schema, shown for reference only;
// from_avro derives the struct type from the Avro schema itself
val dataSchema = new StructType()
  .add(StructField("name", StringType))
  .add(StructField("age", IntegerType))
// Read the streaming data from the source topic. The values are assumed to be
// plain Apache Avro (no Confluent header), which is what from_avro expects.
// Note that from_avro takes the schema as a JSON string, not a Schema object.
val df = spark.readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "localhost:9092")
  .option("subscribe", sourceTopic)
  .option("startingOffsets", "earliest")
  .load()
  .select(from_avro(col("value"), schema.toString).as("data"))
  .select("data.*")
// to_avro emits plain Apache Avro bytes; it does NOT add the Confluent
// wire-format header, so writing avroDf straight to the kafka sink would be
// unreadable for schema-registry-aware consumers. The foreach sink further
// below re-serializes each row with KafkaAvroSerializer instead; the sketch
// right after this shows the other option of adding the header by hand and
// keeping the built-in kafka sink.
val avroDf = df.select(to_avro(struct("*")).as("value"))
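// A minimal sketch of the header-prepend route, assuming the schema is
// registered once up front under the "<topic>-value" subject naming strategy;
// the register(String, Schema) call matches older CachedSchemaRegistryClient
// releases (newer ones take a ParsedSchema instead).
import io.confluent.kafka.schemaregistry.client.CachedSchemaRegistryClient
import java.nio.ByteBuffer

val registryClient = new CachedSchemaRegistryClient(schemaRegistryUrl, 100)
val schemaId = registryClient.register(s"$sinkTopic-value", schema)
val header = ByteBuffer.allocate(5).put(0: Byte).putInt(schemaId).array()
// concat works on binary columns, so the framing is a pure Spark expression
val confluentDf = avroDf.select(concat(lit(header), col("value")).as("value"))
// confluentDf could then use the built-in sink directly:
//   confluentDf.writeStream.format("kafka")
//     .option("kafka.bootstrap.servers", "localhost:9092")
//     .option("topic", sinkTopic) ...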
// KafkaAvroSerializer serializes Avro objects, not pre-encoded byte arrays,
// so each Row is rebuilt as a GenericRecord; the serializer then registers
// the schema and adds the Confluent framing on every send
def rowToRecord(row: Row): GenericRecord = {
  val record = new GenericData.Record(schema)
  record.put("name", row.getAs[String]("name"))
  record.put("age", row.getAs[Int]("age"))
  record
}
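// Optional sanity check on the driver, a sketch that exercises the real
// serializer directly; the first byte of the output should be the Confluent
// magic byte 0x0 (requires a running Schema Registry)
val probeSerializer = new KafkaAvroSerializer()
probeSerializer.configure(
  Map[String, Any](
    AbstractKafkaAvroSerDeConfig.SCHEMA_REGISTRY_URL_CONFIG -> schemaRegistryUrl
  ).asJava,
  false) // false = configure as a value serializer
val probe = new GenericData.Record(schema)
probe.put("name", "alice")
probe.put("age", 30)
val framedBytes = probeSerializer.serialize(sinkTopic, probe)
assert(framedBytes(0) == 0x0.toByte)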
// Write the records to Kafka as Confluent Avro. The built-in format("kafka")
// sink and foreach(...) are mutually exclusive, so this uses a ForeachWriter
// that owns one producer per partition and reuses it across rows, rather
// than opening and closing a producer for every single message
val query: StreamingQuery = df.writeStream
  .outputMode(OutputMode.Append())
  .option("checkpointLocation", "/tmp/checkpoints")
  .foreach(new ForeachWriter[Row] {
    var producer: KafkaProducer[String, GenericRecord] = _
    override def open(partitionId: Long, epochId: Long): Boolean = {
      producer = new KafkaProducer[String, GenericRecord](kafkaParams.asJava)
      true
    }
    override def process(row: Row): Unit = {
      producer.send(new ProducerRecord[String, GenericRecord](sinkTopic, rowToRecord(row)))
    }
    override def close(errorOrNull: Throwable): Unit = {
      if (producer != null) producer.close()
    }
  })
  .start()
query.awaitTermination()
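// To verify the result, Confluent's console consumer can decode the framed
// records (topic name assumed from the setup above):
//   kafka-avro-console-consumer --bootstrap-server localhost:9092 \
//     --topic my_topic_confluent --from-beginning \
//     --property schema.registry.url=http://localhost:8081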