从 Kafka 的 stationTopic 读取数据,根据消息内容进行过滤(只保留包含 "success" 的非空消息),过滤后写入 Kafka 的 etlTopic。
代码如下:
/**
 * Structured Streaming ETL demo.
 *
 * Reads raw records from the Kafka topic `stationTopic`, keeps only the
 * non-blank messages whose text contains "success", and writes them back to
 * Kafka. The destination topic is carried in the `topic` column of each output
 * row ("etlTopic").
 */
object ETLDemo {

  /**
   * Entry point: builds the streaming pipeline and blocks until the query
   * terminates.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // 1. Connect to Kafka.
    // `getSparkSession` is not defined in this object; it must be in scope
    // from elsewhere in the project.
    val spark: SparkSession = getSparkSession
    val kafkaSourceDf: DataFrame = spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "node1:9092")
      .option("subscribe", "stationTopic")
      // The Kafka source reads its start position from "startingOffsets";
      // the previously used key "offset" is not a recognised option name.
      .option("startingOffsets", "latest")
      .load()
    kafkaSourceDf.printSchema()

    // 2. Filter the data.
    import spark.implicits._
    val etlDf = kafkaSourceDf
      .mapPartitions { rows =>
        rows.flatMap { row =>
          // Column 1 is read as the binary message payload (the `value`
          // column of the Kafka source schema printed above). Records with a
          // null payload are dropped here, because decoding null bytes would
          // throw before the blank-check in `filter` could run.
          Option(row.getAs[Array[Byte]](1))
            .map(bytes => new String(bytes, StandardCharsets.UTF_8))
        }
      }
      .filter(value => StringUtils.isNoneBlank(value) && value.contains("success"))
      // Pair every message with its destination topic.
      .map(_ -> "etlTopic")
      .toDF("value", "topic")

    // 3. Write back to Kafka.
    etlDf.writeStream
      .format("kafka")
      .outputMode("append")
      .option("kafka.bootstrap.servers", "node1:9092")
      .option("checkpointLocation", "data/output/ck/station")
      .start()
      .awaitTermination()
  }

  /**
   * Debug helper: streams `df` to the console sink in append mode and blocks
   * until the query terminates.
   *
   * @param df streaming DataFrame to print
   */
  def printDF(df: DataFrame): Unit = {
    df.writeStream
      .outputMode("append")
      .format("console")
      .start()
      .awaitTermination()
  }
}
mock数据的kafka生产者代码:
package spark.structed.stream.etl.demo
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import org.apache.kafka.common.serialization.StringSerializer
import java.util.{Properties, Random}
/**
 * Mock data generator: endlessly produces random base-station call logs and
 * sends them, one comma-separated line per record, to the Kafka topic
 * `stationTopic`.
 */
object MockStationLog {

  /** Topic the mock records are sent to. */
  private val TargetTopic = "stationTopic"

  /** Pool of call statuses; "success" is listed 9 times out of 12 entries. */
  private val AllStatus: Array[String] = Array(
    "fail", "busy", "barring", "success", "success", "success",
    "success", "success", "success", "success", "success", "success"
  )

  /**
   * Entry point: creates the producer and sends one random record every
   * 100-199 ms until the JVM is stopped.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // Producer configuration.
    val props = new Properties()
    props.put("bootstrap.servers", "node1:9092")
    props.put("acks", "1")
    props.put("retries", "3")
    props.put("key.serializer", classOf[StringSerializer].getName)
    props.put("value.serializer", classOf[StringSerializer].getName)

    val producer = new KafkaProducer[String, String](props)
    // The send loop below never exits normally, so a `close()` placed after
    // it would be unreachable. Close the producer when the JVM shuts down
    // (e.g. Ctrl-C or an uncaught exception ending `main`) instead.
    sys.addShutdownHook(producer.close())

    val random = new Random()
    while (true) {
      // Randomly generate one base-station log record.
      val stationLog = randomStationLog(random)
      println(stationLog.toString)
      Thread.sleep(100 + random.nextInt(100))
      val record = new ProducerRecord[String, String](TargetTopic, stationLog.toString)
      producer.send(record)
    }
  }

  /** Builds one random [[StationLog]]; only successful calls get a non-zero duration. */
  private def randomStationLog(random: Random): StationLog = {
    val callOut: String = "1860000%04d".format(random.nextInt(10000))
    val callIn: String = "1890000%04d".format(random.nextInt(10000))
    val callStatus: String = AllStatus(random.nextInt(AllStatus.length))
    // Successful calls last 1000-10000 (a multiple of 1000); all others 0.
    val callDuration =
      if ("success".equals(callStatus)) (1 + random.nextInt(10)) * 1000L else 0L
    StationLog(
      "station_" + random.nextInt(10),
      callOut,
      callIn,
      callStatus,
      System.currentTimeMillis(),
      callDuration
    )
  }

  /**
   * Base-station call log record.
   *
   * @param stationId  base-station identifier
   * @param callOut    calling (caller) number
   * @param callIn     called (callee) number
   * @param callStatus call status
   * @param callTime   call time
   * @param duration   call duration
   */
  case class StationLog(
    stationId: String,
    callOut: String,
    callIn: String,
    callStatus: String,
    callTime: Long,
    duration: Long
  ) {
    /** Renders the record as one comma-separated line, fields in declaration order. */
    override def toString: String = {
      s"$stationId,$callOut,$callIn,$callStatus,$callTime,$duration"
    }
  }
}
Kafka 常用命令:查看、删除、创建 topic,以及模拟生产者和消费者:
# Reference commands for managing the demo topics; each line is meant to be run on its own.
# NOTE(review): the topic commands use --zookeeper and the producers use --broker-list;
# newer Kafka releases expect --bootstrap-server instead — confirm against the installed Kafka version.
# List the existing topics
/export/server/kafka/bin/kafka-topics.sh --list --zookeeper node1:2181
# Delete the source topic (stationTopic) and the sink topic (etlTopic)
/export/server/kafka/bin/kafka-topics.sh --delete --zookeeper node1:2181 --topic stationTopic
/export/server/kafka/bin/kafka-topics.sh --delete --zookeeper node1:2181 --topic etlTopic
# Create both topics with 3 partitions and replication factor 1
/export/server/kafka/bin/kafka-topics.sh --create --zookeeper node1:2181 --replication-factor 1 --partitions 3 --topic stationTopic
/export/server/kafka/bin/kafka-topics.sh --create --zookeeper node1:2181 --replication-factor 1 --partitions 3 --topic etlTopic
# Simulate a producer: console producer for each topic
/export/server/kafka/bin/kafka-console-producer.sh --broker-list node1:9092 --topic stationTopic
/export/server/kafka/bin/kafka-console-producer.sh --broker-list node1:9092 --topic etlTopic
# Simulate a consumer: console consumer for each topic, reading from the beginning
/export/server/kafka/bin/kafka-console-consumer.sh --bootstrap-server node1:9092 --topic stationTopic --from-beginning
/export/server/kafka/bin/kafka-console-consumer.sh --bootstrap-server node1:9092 --topic etlTopic --from-beginning