Flume configuration:
~~~~~
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Describe/configure r1
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444
# Describe k1
#a1.sinks.k1.type = logger
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.kafka.topic = test14
a1.sinks.k1.kafka.bootstrap.servers = 192.168.1.234:9092,192.168.1.235:9092,192.168.1.236:9092
a1.sinks.k1.kafka.producer.acks = 1
a1.sinks.k1.kafka.flumeBatchSize = 2
a1.sinks.k1.kafka.producer.linger.ms = 1
a1.sinks.k1.kafka.producer.compression.type = snappy
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 100
a1.channels.c1.transactionCapacity = 10
# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
~~~~~
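To launch the agent with this configuration, something like the following should work (the conf directory and config file name here are assumptions; only the agent name a1 comes from the config above):

~~~~~
flume-ng agent --conf conf --conf-file flume-kafka.conf --name a1 -Dflume.root.logger=INFO,console
~~~~~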
Then start a telnet session to send some test data.
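The netcat source above listens on localhost:44444, so connecting is just:

~~~~~
telnet localhost 44444
~~~~~

Anything typed into this session is forwarded through the memory channel to the Kafka sink.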
Spark code:
~~~~~
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.streaming.StreamingQuery
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

def main(args: Array[String]): Unit = {
  // Silence Spark's internal logging so the console output stays readable
  Logger.getLogger("org").setLevel(Level.ERROR)

  val spark: SparkSession = SparkSession.builder()
    .appName("aa")
    .master("local[2]")
    .getOrCreate()
  import spark.implicits._

  // Read the test14 topic that the Flume Kafka sink writes to
  val line: DataFrame = spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "scfl4:9092,scfl5:9092,scfl6:9092")
    .option("subscribe", "test14")
    // .option("startingOffsets", "earliest")
    .load()

  // Kafka delivers the payload as bytes; cast it to a string column
  val dataset: Dataset[String] = line
    .selectExpr("CAST(value AS STRING)")
    .as[String]

  // Classic word count over the stream
  val wordcount: DataFrame = dataset
    .flatMap(_.split(" "))
    .groupBy("value")
    .count()

  // Print the full (complete-mode) counts to the console on every trigger
  val query: StreamingQuery = wordcount.writeStream
    .outputMode("complete")
    .format("console")
    .start()

  query.awaitTermination()
}
~~~~~
The data printed to the console looked off; it turned out that when Flume collects from telnet, it also captures the carriage return from the Enter key.
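A minimal sketch of one way to fix it (this is an assumption, not part of the original code): strip the CR/LF before tokenizing and drop empty tokens, reusing the dataset value from the Spark code above.

~~~~~
// Strip the \r that telnet appends to every line, then drop blank
// entries so a bare Enter key press is not counted as a word.
val cleaned: Dataset[String] = dataset
  .map(_.replaceAll("[\\r\\n]", "").trim)
  .filter(_.nonEmpty)

val wordcount: DataFrame = cleaned
  .flatMap(_.split(" "))
  .filter(_.nonEmpty)
  .groupBy("value")
  .count()
~~~~~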