The project is structured as follows:
1. Create a new Maven project: MySparkstreaming
2. Add the dependencies to pom.xml. The versions of the individual dependencies must match each other, otherwise you will hit errors such as AbstractMethodError.
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<!-- Spark is pinned to a lower version here so that it matches the spark-streaming-kafka-0-10_2.11 dependency below; a mismatch throws AbstractMethodError. My installed Spark is 2.4.4, but the connector build used here targets 2.1.0. -->
<spark.version>2.1.0</spark.version>
<kafka.version>2.0.0</kafka.version>
</properties>
<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
<scope>test</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-core -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-sql -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.kafka/kafka -->
<dependency>
<groupId>org.apache.kafka</groupId>
<!-- use the _2.11 build so every dependency stays on the same Scala binary version -->
<artifactId>kafka_2.11</artifactId>
<version>${kafka.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.kafka/kafka-clients -->
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-clients</artifactId>
<version>${kafka.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming-kafka-0-10 -->
<!-- KafkaUtils from this artifact is needed to create the Kafka DStream source; its version must track Spark's version, not Kafka's -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
</dependencies>
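If you are unsure which versions actually landed on the classpath, a throwaway check like the one below can save a round of AbstractMethodError debugging. This is only a debugging aid, not part of the pipeline; the expected values reflect the pom above:
import org.apache.kafka.clients.consumer.KafkaConsumer
//throwaway sanity check: print the Spark version and the kafka-clients jar actually loaded
object VersionCheck extends App {
  println(org.apache.spark.SPARK_VERSION) //expect 2.1.0
  println(classOf[KafkaConsumer[_, _]].getProtectionDomain.getCodeSource.getLocation) //path of the loaded kafka-clients jar, expect 2.0.0
}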
3. Create a read trait: ReadTrait
package cn.alisa.mySparkstreaming.services
import java.util.Properties
import org.apache.spark.streaming.dstream.InputDStream
trait ReadTrait[T] {
//implicit conversions between Java and Scala collections
import scala.collection.JavaConversions._
def reader(prop:Map[String,Object],tableName:String):InputDStream[T]
//the Properties overload converts the Java config to a Scala Map and delegates to the abstract reader above
def reader(prop:Properties,tableName:String):InputDStream[T]=reader(prop.toMap[String,Object],tableName)
}
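The Properties overload exists purely for Java-style configs; under JavaConversions a java.util.Properties becomes a Scala Map before being handed to the abstract method. A minimal sketch of that conversion (the key/value are just placeholders):
import java.util.Properties
import scala.collection.JavaConversions._
object PropsToMapDemo extends App {
  val props = new Properties()
  props.put("bootstrap.servers", "192.168.21.130:9092")
  //this is exactly what the default reader(Properties, ...) does before delegating
  val asMap: Map[String, Object] = props.toMap
  println(asMap) //Map(bootstrap.servers -> 192.168.21.130:9092)
}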
4. Create a write trait: WriteTrait
package cn.alisa.mySparkstreaming.services
import java.util.Properties
import org.apache.spark.streaming.dstream.DStream
trait WriteTrait[T] {
//implicit conversions between Java and Scala collections, as in ReadTrait
import scala.collection.JavaConversions._
def writer(prop:Map[String,Object],tableName:String,ds:DStream[T]):Unit
def writer(prop:Properties,tableName:String,ds:DStream[T]):Unit=writer(prop.toMap[String,Object],tableName,ds)
}
5. Create a class and its companion object: KafkaReader, which implements ReadTrait
package cn.alisa.mySparkstreaming.services.impl
import cn.alisa.mySparkstreaming.services.ReadTrait
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
class KafkaReader(ssc:StreamingContext) extends ReadTrait[ConsumerRecord[String,String]]{
override def reader(prop: Map[String,Object], tableName: String): InputDStream[ConsumerRecord[String,String]] = {
//direct stream: executors read the Kafka partitions directly, no receiver
KafkaUtils.createDirectStream(ssc,
LocationStrategies.PreferConsistent, //spread partitions evenly across executors
ConsumerStrategies.Subscribe[String,String](Set(tableName),prop))
}
}
//Companion object.
//The companion object defines an apply method as a factory for the companion class, so instances can be created without the new keyword.
object KafkaReader{
def apply(ssc: StreamingContext): KafkaReader = new KafkaReader(ssc)
}
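A minimal standalone sketch of how KafkaReader is used (the broker address and topic come from the rest of this post; the group id "reader_demo" is made up):
import cn.alisa.mySparkstreaming.services.impl.KafkaReader
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
object ReaderDemo extends App {
  val kafkaParams: Map[String, Object] = Map(
    ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "192.168.21.130:9092",
    ConsumerConfig.GROUP_ID_CONFIG -> "reader_demo",
    ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
    ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer]
  )
  val ssc = new StreamingContext(
    new SparkConf().setMaster("local[*]").setAppName("reader demo"), Seconds(1))
  //apply lets us skip the new keyword; print raw values to confirm the plumbing
  KafkaReader(ssc).reader(kafkaParams, "event_attendees_raw").map(_.value()).print()
  ssc.start()
  ssc.awaitTermination()
}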
6. Create a class and its companion object: KafkaSink, which KafkaWriter broadcasts and uses to send messages
package cn.alisa.mySparkstreaming.services.commons
import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
//KafkaSink does the actual sending
//it must be Serializable because it is broadcast to the executors
class KafkaSink[K,V](producer:()=>KafkaProducer[K,V]) extends Serializable {
//lazy gives deferred initialization: the producer is only created on first use.
//This solves the serialization problem with the connection, and the producer is not rebuilt for every send.
lazy val prod: KafkaProducer[K, V] = producer()
def send(topic: String, key: K, value: V) = {
prod.send(new ProducerRecord[K, V](topic, key, value))
}
def send(topic: String, value: V) = {
prod.send(new ProducerRecord[K, V](topic, value))
}
}
object KafkaSink{
//implicit Java/Scala conversions; the Properties parameter of the second apply is a java.util.Properties
import scala.collection.JavaConversions._
def apply[K,V](config:Map[String,Object]) = {
//anonymous function that builds the producer
val createKafkaProducer=()=>{
val produ = new KafkaProducer[K,V](config)
//register a JVM shutdown hook so the producer is closed when the JVM exits
sys.addShutdownHook{
produ.close()
}
produ
}
new KafkaSink[K,V](createKafkaProducer)
}
def apply[K,V](config:Properties): KafkaSink[K,V] = apply(config.toMap)
}
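KafkaSink also works on its own, outside Spark, which makes the lazy initialization easy to see: no KafkaProducer exists until the first send. A hypothetical sketch (the topic name is made up):
import cn.alisa.mySparkstreaming.services.commons.KafkaSink
import org.apache.kafka.clients.producer.ProducerConfig
import org.apache.kafka.common.serialization.StringSerializer
object SinkDemo extends App {
  val conf: Map[String, Object] = Map(
    ProducerConfig.BOOTSTRAP_SERVERS_CONFIG -> "192.168.21.130:9092",
    ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG -> classOf[StringSerializer],
    ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG -> classOf[StringSerializer]
  )
  val sink = KafkaSink[String, String](conf)
  //the lazy prod field means the KafkaProducer is created here, on the first send
  sink.send("demo_topic", "hello")
}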
7. Create a class and its companion object: KafkaWriter, which implements WriteTrait
package cn.alisa.mySparkstreaming.services.impl
import cn.alisa.mySparkstreaming.services.WriteTrait
import cn.alisa.mySparkstreaming.services.commons.KafkaSink
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream
class KafkaWriter(ssc:StreamingContext) extends WriteTrait[String]{
override def writer(prop: Map[String, Object], tableName: String, ds: DStream[String]): Unit = {
//broadcast the (serializable) sink once; each executor lazily opens its own producer
val bc = ssc.sparkContext.broadcast(KafkaSink[String,String](prop))
ds.foreachRDD(rdd=>rdd.foreachPartition(iter=>{
//Iterator.map is lazy, so send directly with foreach instead of map plus a no-op foreach
iter.foreach(msg=>bc.value.send(tableName,msg))
}))
}
}
//Companion object
object KafkaWriter{
def apply(ssc: StreamingContext): KafkaWriter = new KafkaWriter(ssc)
}
8. Create a transform trait: TransformTrait, for the conversion step.
package cn.alisa.mySparkstreaming.services
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
trait TransformTrait[T,V] {
def transform(in:InputDStream[T]):DStream[V]
}
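The contract is just InputDStream[T] in, DStream[V] out. As a trivial illustration (not used in the project), a hypothetical pass-through transform that only extracts the Kafka record values would look like this:
import cn.alisa.mySparkstreaming.services.TransformTrait
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
//hypothetical no-op transform: forward each record's value unchanged
trait PassThrough_Trait extends TransformTrait[ConsumerRecord[String, String], String] {
  override def transform(in: InputDStream[ConsumerRecord[String, String]]): DStream[String] =
    in.map(_.value())
}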
9. Create a trait Event_Attendees_Trait that implements TransformTrait and performs the actual conversion
package cn.alisa.mySparkstreaming.services.userImpl
import cn.alisa.mySparkstreaming.services.TransformTrait
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
//the input type matches the reader, the output type matches the writer
trait Event_Attendees_Trait extends TransformTrait[ConsumerRecord[String,String],String]{
override def transform(in: InputDStream[ConsumerRecord[String, String]]): DStream[String] = {
in.flatMap(line=>{
//line format: event,yes,maybe,invited,no (each field a space-separated user list)
val info = line.value().split(",", -1)
//[(123,456,yes),(123,456,yes)......]
val yes = info(1).split(" ").map(us=>(info(0),us,"yes"))
val maybe = info(2).split(" ").map(us=>(info(0),us,"maybe"))
val invited = info(3).split(" ").map(us=>(info(0),us,"invited"))
val no = info(4).split(" ").map(us=>(info(0),us,"no"))
yes++maybe++invited++no
}.filter(_._2.trim()!="").map(tp=>tp.productIterator.mkString(",")))
}
}
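The flatMap logic is easiest to see on a single sample line. The line below is made up (format: event,yes,maybe,invited,no), but the transformation steps are the same as in the trait:
object EventAttendeesDemo extends App {
  val line = "123,100 200,300,400 401," //event 123; the "no" list is empty
  val info = line.split(",", -1)
  val yes     = info(1).split(" ").map(us => (info(0), us, "yes"))
  val maybe   = info(2).split(" ").map(us => (info(0), us, "maybe"))
  val invited = info(3).split(" ").map(us => (info(0), us, "invited"))
  val no      = info(4).split(" ").map(us => (info(0), us, "no"))
  //the empty "no" entry is removed by the same trim-filter used in the trait
  (yes ++ maybe ++ invited ++ no)
    .filter(_._2.trim != "")
    .map(_.productIterator.mkString(","))
    .foreach(println)
  //prints: 123,100,yes / 123,200,yes / 123,300,maybe / 123,400,invited / 123,401,invited
}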
10. Create a trait User_Friends_Trait that implements TransformTrait and performs the actual conversion
package cn.alisa.mySparkstreaming.services.userImpl
import cn.alisa.mySparkstreaming.services.TransformTrait
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
trait User_Friends_Trait extends TransformTrait[ConsumerRecord[String,String],String]{
override def transform(in: InputDStream[ConsumerRecord[String, String]]): DStream[String] = {
in.filter(line=>{
//drop lines ending with "," (users with an empty friend list)
val ln=line.value()
val reg=",$".r
!reg.findAllMatchIn(ln).hasNext
}).flatMap(line=>{
//line format: user,friend1 friend2 friend3...
val info = line.value().split(",",-1)
//split first, then drop empty friend ids (the original String.filter compared a Char to "" and never filtered anything)
info(1).split(" ").filter(_.trim.nonEmpty).map(friendid=>{
(info(0),friendid)
})
}).map(_.productIterator.mkString(","))
}
}
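Again, a single made-up sample makes the behavior concrete (format: user,friend1 friend2 ...; a line ending in "," has no friends and is dropped by the regex filter):
object UserFriendsDemo extends App {
  val lines = Seq("100,200 300", "101,")
  val reg = ",$".r
  lines
    .filter(ln => !reg.findAllMatchIn(ln).hasNext) //"101," is dropped here
    .flatMap { ln =>
      val info = ln.split(",", -1)
      info(1).split(" ").filter(_.trim.nonEmpty).map(f => (info(0), f))
    }
    .map(_.productIterator.mkString(","))
    .foreach(println) //prints: 100,200 and 100,300
}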
11. Create a class: KTKExecutor, which mixes in the traits, i.e. stitches the previous pieces together
package cn.alisa.mySparkstreaming.services
import cn.alisa.mySparkstreaming.services.impl.{KafkaReader, KafkaWriter}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
//a concrete TransformTrait must be mixed in when the class is instantiated
class KTKExecutor(readConf:Map[String,Object],writeConf:Map[String,Object]) {
//self-type annotation: "tran" refers to the mixed-in TransformTrait
tran:TransformTrait[ConsumerRecord[String,String],String]=>
def worker(intopic:String,outtopic:String)={
//create a StreamingContext
val sc = new SparkConf().setMaster("local[*]").setAppName("read test")
.set("spark.serializer","org.apache.spark.serializer.KryoSerializer")
.set("spark.streaming.kafka.consumer.poll.ms","10000")
val ssc = new StreamingContext(sc, Seconds(1))
ssc.checkpoint("e:/ck")
//read the source topic from Kafka
val kr=new KafkaReader(ssc).reader(readConf,intopic)
//process the data with the mixed-in transform trait
val ds=tran.transform(kr)
//write the result to the output Kafka topic
new KafkaWriter(ssc).writer(writeConf,outtopic,ds)
ssc.start()
ssc.awaitTermination()
}
}
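The self-type is what enforces the mixin: KTKExecutor alone does not satisfy tran:TransformTrait[...], so it cannot be instantiated bare. A sketch, assuming the inParams/outParams maps built in step 12:
//val e = new KTKExecutor(inParams, outParams) //does not compile: self-type not satisfied
//mixing in a transform trait satisfies the self-type, and worker() dispatches
//tran.transform(...) to the mixed-in implementation:
val e = new KTKExecutor(inParams, outParams) with Event_Attendees_Trait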
12. Create a test object: MyTest
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.clients.producer.ProducerConfig
import org.apache.kafka.common.serialization.{StringDeserializer, StringSerializer}
import cn.alisa.mySparkstreaming.services.KTKExecutor
import cn.alisa.mySparkstreaming.services.userImpl.{Event_Attendees_Trait, User_Friends_Trait}
object MyTest {
def main(args: Array[String]): Unit = {
val inParams=Map(
//connect to the Kafka cluster
ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG->"192.168.21.130:9092",
//consumer group id
ConsumerConfig.GROUP_ID_CONFIG->"alisa",
//maximum number of records returned per poll
ConsumerConfig.MAX_POLL_RECORDS_CONFIG->"1000",
//the deserializers turn the byte arrays received from Kafka back into objects
ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG->classOf[StringDeserializer],
ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG->classOf[StringDeserializer],
ConsumerConfig.AUTO_OFFSET_RESET_CONFIG->"earliest"
)
val outParams=Map(
ProducerConfig.BOOTSTRAP_SERVERS_CONFIG->"192.168.21.130:9092",
ProducerConfig.ACKS_CONFIG->"1",
ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG->classOf[StringSerializer],
ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG->classOf[StringSerializer]
)
//dynamic mixin: choose the transform by choosing which trait to mix in
// (new KTKExecutor(inParams,outParams) with User_Friends_Trait).worker("user_friends_raw","user_friends_ss")
(new KTKExecutor(inParams,outParams) with Event_Attendees_Trait).worker("event_attendees_raw","event_attendees_ss")
}
}
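To verify the result end to end, one option is a tiny throwaway consumer on the output topic. This is only a debugging aid under the same assumptions as above; poll(long) is the older overload that still exists (deprecated) in kafka-clients 2.0:
import java.util.{Collections, Properties}
import org.apache.kafka.clients.consumer.KafkaConsumer
import scala.collection.JavaConversions._
object VerifyOutput extends App {
  val props = new Properties()
  props.put("bootstrap.servers", "192.168.21.130:9092")
  props.put("group.id", "verify")
  props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
  props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
  props.put("auto.offset.reset", "earliest")
  val consumer = new KafkaConsumer[String, String](props)
  consumer.subscribe(Collections.singletonList("event_attendees_ss"))
  //poll a few seconds' worth of records and print what the pipeline produced
  for (r <- consumer.poll(5000L)) println(r.value())
  consumer.close()
}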