import org.apache.avro.Schema
import org.apache.avro.generic.GenericData
import org.apache.avro.generic.GenericRecord
import io.confluent.kafka.serializers.KafkaAvroSerializer
import org.apache.spark.sql.functions._
import org.apache.spark.sql.avro.functions._
import scala.jdk.CollectionConverters._ // needed for .asJava below (use scala.collection.JavaConverters._ on Scala 2.12)
import spark.implicits._                // needed for Seq(...).toDF; already in scope in spark-shell
val bootstrapServers = "localhost:9092"
val schemaRegistryUrl = "http://localhost:8081"
val topic = "my_topic"
// define schema
val schemaString = """{
"type": "record",
"name": "MyClass",
"fields": [
{"name": "field1", "type": "string"},
{"name": "field2", "type": "int"}
]
}"""
val schema = new Schema.Parser().parse(schemaString)
// create Avro record
val record = new GenericData.Record(schema)
record.put("field1", "value1")
record.put("field2", 123)
// configure a KafkaAvroSerializer; only schema.registry.url is read by the serializer
// itself (bootstrap.servers and value.serializer are producer settings, not serializer settings)
val props = Map(
  "schema.registry.url" -> schemaRegistryUrl
)
val kafkaAvroSerializer = new KafkaAvroSerializer()
kafkaAvroSerializer.configure(props.asJava, false) // isKey = false: it serializes record values
// serialize the record in the Confluent wire format (magic byte + 4-byte schema ID + Avro payload);
// the serializer registers the schema with the Schema Registry on first use
val avroBytes = kafkaAvroSerializer.serialize(topic, record)
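// sketch (not in the original snippet): publish the manually serialized bytes with a plain
// KafkaProducer; the value was already Confluent-encoded above, so a ByteArraySerializer is
// enough on the producer side. The key is left null here for brevity.
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import org.apache.kafka.common.serialization.ByteArraySerializer

val producerProps = new java.util.Properties()
producerProps.put("bootstrap.servers", bootstrapServers)
producerProps.put("key.serializer", classOf[ByteArraySerializer].getName)
producerProps.put("value.serializer", classOf[ByteArraySerializer].getName)

val producer = new KafkaProducer[Array[Byte], Array[Byte]](producerProps)
producer.send(new ProducerRecord[Array[Byte], Array[Byte]](topic, avroBytes))
producer.close()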
// build a DataFrame and encode each row as Avro with Spark's to_avro.
// note: to_avro emits plain Avro binary WITHOUT the Confluent wire-format header, so
// consumers using KafkaAvroDeserializer cannot read it; see the mapPartitions sketch
// after this block for a Confluent-compatible alternative
val df = Seq(("value1", 123), ("value2", 456)).toDF("field1", "field2")
val avroDf = df.select(to_avro(struct("*")).alias("value"))

// write the binary "value" column to Kafka; this is a static DataFrame, so use a batch
// write (writeStream requires a streaming source). The Spark Kafka sink has no
// value.serializer / schema.registry.url options -- it sends the bytes in "value" as-is
avroDf
  .write
  .format("kafka")
  .option("kafka.bootstrap.servers", bootstrapServers)
  .option("topic", topic)
  .save()
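// sketch of a Confluent-compatible alternative (an assumption, not part of the original):
// run KafkaAvroSerializer on the executors via mapPartitions so every row is written in
// the Confluent wire format (magic byte + 4-byte schema ID + Avro payload). One serializer
// is created per partition because KafkaAvroSerializer itself is not serializable.
val confluentDf = df.mapPartitions { rows =>
  val serializer = new KafkaAvroSerializer()
  serializer.configure(Map("schema.registry.url" -> schemaRegistryUrl).asJava, false)
  val recordSchema = new Schema.Parser().parse(schemaString)
  rows.map { row =>
    val rec = new GenericData.Record(recordSchema)
    rec.put("field1", row.getString(0))
    rec.put("field2", row.getInt(1))
    serializer.serialize(topic, rec) // returns Confluent wire-format bytes
  }
}.toDF("value")

confluentDf
  .write
  .format("kafka")
  .option("kafka.bootstrap.servers", bootstrapServers)
  .option("topic", topic)
  .save()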