Before anything else, start the virtual machine, then bring up the ZooKeeper process and the Kafka service. The commands are:
zkServer.sh start
kafka-server-start.sh /opt/software/kafka211/config/server.properties
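If the topics used in the examples below do not exist yet, create them first. A minimal sketch with kafka-topics.sh, assuming ZooKeeper listens on localhost:2181 inside the virtual machine; adjust partitions and replication to your environment:

# create the three topics consumed in this article (single-node settings assumed)
kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic test
kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic user_friends_raw
kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic event_attendees_raw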
1. Create a new Maven project named mySparkstreaming.
2. Configure the pom.xml file.
Spark 2.4.4 is installed locally on Windows, but this example pins a lower version in the pom; otherwise the job fails with:
Exception in thread "main" java.lang.AbstractMethodError
An AbstractMethodError here means the artifacts on the classpath were compiled against different Spark versions. Keep spark-core, spark-sql, spark-streaming, and spark-streaming-kafka-0-10 on the same ${spark.version} and the same _2.11 Scala suffix, as the pom below does.
<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
    <spark.version>2.1.0</spark.version>
    <kafka.version>2.0.0</kafka.version>
</properties>
<dependencies>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.11</version>
        <scope>test</scope>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-core -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-sql -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.kafka/kafka -->
    <!-- _2.11 suffix to match the Scala version of the Spark artifacts -->
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka_2.11</artifactId>
        <version>${kafka.version}</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.kafka/kafka-clients -->
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka-clients</artifactId>
        <version>${kafka.version}</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming-kafka-0-10 -->
    <!-- must track spark.version, not kafka.version; a mismatch here causes the AbstractMethodError above -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
</dependencies>
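After editing the pom, it is worth confirming that exactly one Spark version and one Scala suffix end up on the classpath. A quick check, assuming Maven is on the PATH:

mvn dependency:tree -Dincludes=org.apache.spark,org.apache.kafka,org.scala-lang

Every Spark artifact in the output should report the same version and the _2.11 suffix.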
3. Create a new Scala file, ReadKafkaTopic.scala.
① There is a topic named test whose records carry the fields user, event, invited, and timestamp.
Requirement: print the contents of the test topic.
package cn.alisa.mySparkstreaming

import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}

// Consume a Kafka topic with Spark Streaming
object ReadKafkaTopic {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("read test")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    val ssc = new StreamingContext(conf, Seconds(1))
    ssc.checkpoint("e:/ck")
    val kafkaParams = Map[String, Object](
      // Connect to the Kafka cluster
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "192.168.21.130:9092",
      // Consumer group name
      ConsumerConfig.GROUP_ID_CONFIG -> "alisa",
      // Maximum number of records returned by a single poll
      ConsumerConfig.MAX_POLL_RECORDS_CONFIG -> "500",
      // Deserializers that turn the byte arrays received from Kafka back into strings
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      // Start from the earliest offset when the group has no committed offset
      ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "earliest"
    )
    val ku = KafkaUtils.createDirectStream(ssc,
      // Location strategy: spread partitions evenly across executors
      LocationStrategies.PreferConsistent,
      // Subscribe to the test topic
      ConsumerStrategies.Subscribe[String, String](Set("test"), kafkaParams))
    // Print the value of every record in the test topic
    ku.foreachRDD(rdd => {
      rdd.foreach(rec => println(rec.value()))
    })
    // Also show the first few raw ConsumerRecords of each batch
    ku.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
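To drive the job, type a few records into the topic from the virtual machine with Kafka's console producer. The sample record below is hypothetical, only matching the user,event,invited,timestamp layout described above:

# type hypothetical records in the user,event,invited,timestamp layout at the > prompt
kafka-console-producer.sh --broker-list 192.168.21.130:9092 --topic test
>1,123,456,2012-10-02 15:53:05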
Running the job prints each record's value to the IDE console within one batch interval (one second here).
② There is a topic named user_friends_raw whose records carry the fields user and friends.
Requirement: split each record on the comma first, then split the friends field on spaces and flatten the result, producing one (user, friend) pair per friend. For example, a hypothetical record 111,222 333 becomes the pairs (111,222) and (111,333).
package cn.alisa.mySparkstreaming

import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}

// Consume a Kafka topic with Spark Streaming
object ReadKafkaTopic {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("read test")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    val ssc = new StreamingContext(conf, Seconds(1))
    ssc.checkpoint("e:/ck")
    val kafkaParams = Map[String, Object](
      // Connect to the Kafka cluster
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "192.168.21.130:9092",
      // Consumer group name
      ConsumerConfig.GROUP_ID_CONFIG -> "alisa",
      // Maximum number of records returned by a single poll
      ConsumerConfig.MAX_POLL_RECORDS_CONFIG -> "500",
      // Deserializers that turn the byte arrays received from Kafka back into strings
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      // Start from the earliest offset when the group has no committed offset
      ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "earliest"
    )
    val ku = KafkaUtils.createDirectStream(ssc,
      // Location strategy: spread partitions evenly across executors
      LocationStrategies.PreferConsistent,
      // Subscribe to the user_friends_raw topic
      ConsumerStrategies.Subscribe[String, String](Set("user_friends_raw"), kafkaParams))
    ku.filter(rec => {
      // Drop records whose friends field is empty, i.e. whose value ends with a comma
      val trailingComma = ",$".r
      trailingComma.findFirstIn(rec.value()).isEmpty
    }).flatMap(line => {
      val info = line.value().split(",")
      // Pair the user id with each friend id
      info(1).split(" ").map(fid => (info(0), fid))
    }).foreachRDD(rdd => rdd.foreach(println))
    // Also show the first few raw ConsumerRecords of each batch
    ku.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
Running the job prints one (user, friend) pair per line.
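The pair-building logic can be sanity-checked without Kafka or Spark at all. A minimal sketch that runs the same filter and flatMap steps over a plain List of hypothetical raw values:

object UserFriendsSplitCheck {
  def main(args: Array[String]): Unit = {
    // Hypothetical raw values in the user,friends layout described above
    val samples = List("111,222 333", "444,")
    val pairs = samples
      .filter(v => ",$".r.findFirstIn(v).isEmpty) // drop records with an empty friends list
      .flatMap { v =>
        val info = v.split(",")
        info(1).split(" ").map(fid => (info(0), fid))
      }
    pairs.foreach(println) // prints (111,222) and (111,333); the 444 record is dropped
  }
}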
③ There is a topic named event_attendees_raw whose records carry the fields event, yes, maybe, invited, and no.
Requirement: split each record into (eventid, userid, action) triples, one per user id, where action names the column the user id came from.
package cn.alisa.mySparkstreaming

import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}

// Consume a Kafka topic with Spark Streaming
object ReadKafkaTopic {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("read test")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    val ssc = new StreamingContext(conf, Seconds(1))
    ssc.checkpoint("e:/ck")
    val kafkaParams = Map[String, Object](
      // Connect to the Kafka cluster
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "192.168.21.130:9092",
      // Consumer group name
      ConsumerConfig.GROUP_ID_CONFIG -> "alisa",
      // Maximum number of records returned by a single poll
      ConsumerConfig.MAX_POLL_RECORDS_CONFIG -> "500",
      // Deserializers that turn the byte arrays received from Kafka back into strings
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      // Start from the earliest offset when the group has no committed offset
      ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "earliest"
    )
    val ku = KafkaUtils.createDirectStream(ssc,
      // Location strategy: spread partitions evenly across executors
      LocationStrategies.PreferConsistent,
      // Subscribe to the event_attendees_raw topic
      ConsumerStrategies.Subscribe[String, String](Set("event_attendees_raw"), kafkaParams))
    // Split each record into (eventid, userid, action) triples
    ku.flatMap(line => {
      // split with limit -1 keeps trailing empty fields, so info always has 5 entries
      val info = line.value().split(",", -1)
      // e.g. [(123,456,yes), (123,789,maybe), ...]
      val yes     = info(1).split(" ").filter(_.nonEmpty).map(us => (info(0), us, "yes"))
      val maybe   = info(2).split(" ").filter(_.nonEmpty).map(us => (info(0), us, "maybe"))
      val invited = info(3).split(" ").filter(_.nonEmpty).map(us => (info(0), us, "invited"))
      val no      = info(4).split(" ").filter(_.nonEmpty).map(us => (info(0), us, "no"))
      yes ++ maybe ++ invited ++ no
    }).foreachRDD(rdd => rdd.foreach(println))
    // Also show the first few raw ConsumerRecords of each batch
    ku.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
Running the job prints one (eventid, userid, action) triple per line.
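A natural extension is to write the triples back to a new topic instead of printing them. A minimal sketch of a per-partition producer sink; the target topic name event_attendees is an assumption, not from this article:

package cn.alisa.mySparkstreaming

import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord}

// Sketch: forward (eventid, userid, action) triples to a new topic.
// The topic name "event_attendees" is an assumption for illustration.
object EventAttendeesSink {
  def sendPartition(records: Iterator[(String, String, String)]): Unit = {
    val props = new Properties()
    props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.21.130:9092")
    props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG,
      "org.apache.kafka.common.serialization.StringSerializer")
    props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG,
      "org.apache.kafka.common.serialization.StringSerializer")
    // One producer per partition, created on the executor, so no
    // non-serializable object is captured by the streaming closure
    val producer = new KafkaProducer[String, String](props)
    records.foreach { case (eid, uid, action) =>
      producer.send(new ProducerRecord[String, String]("event_attendees", eid, s"$uid,$action"))
    }
    producer.close()
  }
}

It would be wired in by replacing foreachRDD(rdd => rdd.foreach(println)) in the listing above with foreachRDD(rdd => rdd.foreachPartition(EventAttendeesSink.sendPartition)).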