Heh, my first time writing Scala. Even though I was mostly copying from an example, after some stumbling it finally ran successfully~
Configuration file (Maven pom.xml):
<properties>
    <spark.version>2.2.0</spark.version>
    <scala.version>2.11</scala.version>
</properties>
<dependencies>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_${scala.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming_${scala.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_${scala.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
</dependencies>
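Note: the second version further down also imports org.elasticsearch.spark.rdd.EsSpark, which none of the dependencies above provide. It comes from the elasticsearch-hadoop project; a dependency along these lines is needed (the version shown is an assumption, pick one matching your Elasticsearch cluster):

    <dependency>
        <groupId>org.elasticsearch</groupId>
        <artifactId>elasticsearch-spark-20_2.11</artifactId>
        <!-- assumed version; align with your Elasticsearch cluster -->
        <version>6.2.4</version>
    </dependency>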
Sending messages to Kafka from Scala:
import java.util.concurrent.Future
import java.util.{Properties, UUID}
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord, RecordMetadata}
import org.apache.kafka.common.serialization.StringSerializer

object Producer {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    // Kafka cluster addresses and ports. When using hostnames like these, add the
    // ip-to-hostname mappings to C:\Windows\System32\drivers\etc\hosts (on Windows);
    // otherwise configure ip:port directly.
    props.setProperty("bootstrap.servers", "cbp3.chinaoly.com:6667,cbp4.chinaoly.com:6667,cbp5.chinaoly.com:6667,cbp6.chinaoly.com:6667,cbp7.chinaoly.com:6667,cbp8.chinaoly.com:6667")
    // wire format of keys and values
    props.setProperty("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.setProperty("value.serializer", classOf[StringSerializer].getName)
    // acks controls acknowledgement behavior, default 1:
    //  0      send to the cluster without waiting for any acknowledgement
    //  1      wait for the leader's acknowledgement; the leader does not wait for followers
    //  -1/all the leader waits for the followers before acknowledging to the client
    // props.setProperty("acks", "1")
    val producerClient = new KafkaProducer[String, String](props)
    // topic, partition, key, value
    val record = new ProducerRecord[String, String]("topic-ztLogInfo", 0,
      UUID.randomUUID().toString, "hello")
    val result: Future[RecordMetadata] = producerClient.send(record)
    // get() blocks until the broker acknowledges, so there is no need to spin on isDone
    println(result.get())
    producerClient.close()
  }
}
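Blocking on result.get() makes the send effectively synchronous. For an asynchronous variant, the producer's send(record, callback) overload (a standard Kafka client API) can be used instead; a minimal sketch, reusing producerClient and record from above:

    // Non-blocking send: the callback fires once the broker acknowledges (or fails)
    producerClient.send(record, new org.apache.kafka.clients.producer.Callback {
      override def onCompletion(metadata: RecordMetadata, exception: Exception): Unit = {
        if (exception != null) exception.printStackTrace()
        else println(s"sent to ${metadata.topic()}-${metadata.partition()} @ offset ${metadata.offset()}")
      }
    })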
Receiving data from Kafka in Scala:
import java.util
import org.apache.kafka.clients.consumer.{ConsumerRecord, ConsumerRecords, KafkaConsumer}
import org.apache.kafka.common.serialization.StringDeserializer

object Consumer {
  def main(args: Array[String]): Unit = {
    val configs = new util.HashMap[String, AnyRef]()
    configs.put("bootstrap.servers", "cbp3.chinaoly.com:6667,cbp4.chinaoly.com:6667,cbp5.chinaoly.com:6667,cbp6.chinaoly.com:6667,cbp7.chinaoly.com:6667,cbp8.chinaoly.com:6667")
    configs.put("key.deserializer", classOf[StringDeserializer].getName)
    configs.put("value.deserializer", classOf[StringDeserializer].getName)
    // consumer group id
    configs.put("group.id", "9527")
    // auto.offset.reset: [latest, earliest], default latest; applies when the group has no committed offset.
    // latest: after the consumer connects, it only sees data produced from that point on
    // earliest: start from the earliest available offset (the beginning of the log)
    configs.put("auto.offset.reset", "earliest")
    // whether to automatically commit the offsets of consumed data
    configs.put("enable.auto.commit", "true")
    val consumer = new KafkaConsumer[String, String](configs)
    consumer.subscribe(util.Arrays.asList("topic-ztLogInfo"))
    while (true) {
      // poll with a timeout to fetch data
      val records: ConsumerRecords[String, String] = consumer.poll(3000)
      println(" == count : " + records.count())
      val recordIter: util.Iterator[ConsumerRecord[String, String]] = records.iterator()
      while (recordIter.hasNext) {
        val record = recordIter.next()
        println(" == value : " + record.value())
      }
    }
    // note: unreachable while the loop above runs forever; break out of the loop to get here
    consumer.close()
  }
}
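With enable.auto.commit set to "true", offsets are committed periodically in the background, so a crash between a commit and the actual processing can skip or re-read records. A minimal sketch of committing manually instead, using the consumer's standard commitSync() API (assumes the same configs setup as above):

    import scala.collection.JavaConverters._

    configs.put("enable.auto.commit", "false")
    val consumer = new KafkaConsumer[String, String](configs)
    consumer.subscribe(util.Arrays.asList("topic-ztLogInfo"))
    while (true) {
      val records = consumer.poll(3000)
      for (record <- records.asScala) {
        println(" == value : " + record.value())
      }
      // commit only after the whole batch has been processed
      consumer.commitSync()
    }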
Version 2:
Spark Streaming reads from Kafka and stores the data into Elasticsearch (ES)
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.elasticsearch.spark.rdd.EsSpark
object LogReceive {
  def main(args: Array[String]): Unit = {
    // ============================ read data from Kafka
    val conf = new SparkConf().setAppName("Kafka_director")
      .set("spark.streaming.kafka.consumer.poll.ms", "30000")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .setMaster("local[*]")
    // batch interval of 3 seconds
    val ssc = new StreamingContext(conf, Seconds(3))
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> ("cbp3.chinaoly.com:6667,cbp4.chinaoly.com:6667,cbp5.chinaoly.com:6667," +
        "cbp6.chinaoly.com:6667,cbp7.chinaoly.com:6667,cbp8.chinaoly.com:6667"),
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "9527",
      "auto.offset.reset" -> "earliest",
      "enable.auto.commit" -> (true: java.lang.Boolean)
    )
    val topics = Array("topic-ztLogInfo")
    val stream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    )
    println(" === begin")
    val value: DStream[String] = stream.map(record => record.value)
    value.print()
    println(" === end ")
    // ============================ processing / computation
    // ============================ write into ES
    val esConf = Map(
      "es.nodes" -> "192.168.52.37",        // ES node
      "es.port" -> "9296",                  // ES port
      "es.resource" -> "t_log_info/logInfo" // ES index/type
    )
    value.foreachRDD(rdd => {
      // saveJsonToEs expects each record to already be a JSON string
      EsSpark.saveJsonToEs(rdd, esConf)
    })
    // ============================ start and wait for termination
    ssc.start()
    ssc.awaitTermination()
  }
}
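A caveat on the sink: EsSpark.saveJsonToEs expects every record to already be a JSON string, while the producer above sends plain text like "hello". As a purely illustrative placeholder for the empty "processing / computation" step, the records could be wrapped into JSON first (the msg field name is invented for this sketch):

    // Hypothetical processing step: wrap each raw line in a JSON envelope so the
    // ES sink receives valid JSON documents ("msg" is an invented field name)
    val jsonValue: DStream[String] = value.map { line =>
      val escaped = line.replace("\\", "\\\\").replace("\"", "\\\"")
      s"""{"msg":"$escaped"}"""
    }
    jsonValue.foreachRDD(rdd => EsSpark.saveJsonToEs(rdd, esConf))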