Spark Streaming word count (socket source)
Dependency:
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.11</artifactId>
    <version>2.2.0</version>
</dependency>
Code: per-batch count
package com.grace.count

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object NetworkWordCount {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("NetworkWordCount")
      .setMaster("local[2]")
    /* batch interval of 3 seconds */
    val ssc = new StreamingContext(sparkConf, Seconds(3))
    /* create a text input stream from the socket and count words */
    val lines = ssc.socketTextStream("hdp-1", 9999)
    lines.flatMap(_.split(" ")).map(x => (x, 1))
      .reduceByKey(_ + _).print()
    /* start the streaming job */
    ssc.start()
    /* block until the job terminates */
    ssc.awaitTermination()
  }
}
On the Linux host hdp-1:
yum install nc -y
nc -l 9999
hello hello you
Output:
-------------------------------------------
Time: 1574401614000 ms
-------------------------------------------
(hello,2)
(you,1)
Code: cumulative (stateful) count
package com.grace.count

import org.apache.spark.{HashPartitioner, SparkConf}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object NetworkUpdateStateWordCount {
  /**
   * String     : the word, e.g. "hello"
   * Seq[Int]   : the word's counts in the current batch
   * Option[Int]: the accumulated count from previous batches
   */
  val updateFunc = (iter: Iterator[(String, Seq[Int], Option[Int])]) => {
    iter.flatMap { case (word, counts, state) =>
      Some(counts.sum + state.getOrElse(0)).map(total => (word, total))
    }
  }

  def main(args: Array[String]) {
    val conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("NetworkUpdateStateWordCount")
    val ssc = new StreamingContext(conf, Seconds(5))
    // updateStateByKey requires checkpointing; in production point this at shared storage such as HDFS
    ssc.checkpoint("D://numberCount.log")
    val lines = ssc.socketTextStream("hdp-1", 9999)
    // reduceByKey alone does not accumulate across batches;
    // updateStateByKey does, given a user-supplied update function (updateFunc)
    val results = lines.flatMap(_.split(" ")).map((_, 1))
      .updateStateByKey(updateFunc, new HashPartitioner(ssc.sparkContext.defaultParallelism), true)
    results.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
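For reference, updateStateByKey also accepts a simpler per-key function instead of the partition iterator used above; a minimal equivalent sketch (the Kafka example later in this post uses this form):

// current batch's counts plus the accumulated count, 0 for a first-seen key
val simpleUpdate = (current: Seq[Int], previous: Option[Int]) =>
  Some(current.sum + previous.getOrElse(0))
// usage: lines.flatMap(_.split(" ")).map((_, 1)).updateStateByKey(simpleUpdate)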
On the Linux host hdp-1:
nc -l 9999
hello hello you
hello hello me
Output:
-------------------------------------------
Time: 1574401895000 ms
-------------------------------------------
(hello,4)
(me,1)
(you,1)
foreachRDD: writing results to Redis
Start Redis on Linux:
cd /usr/local/redis/bin
./redis-server redis.conf
Dependency:
<dependency>
    <groupId>redis.clients</groupId>
    <artifactId>jedis</artifactId>
    <version>2.9.0</version>
</dependency>
package com.grace.count

import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import redis.clients.jedis.Jedis

object NetworkWordCountToRedis {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("NetworkWordCountToRedis").setMaster("local[2]")
    val ssc = new StreamingContext(sparkConf, Seconds(5))
    /* create a text input stream from the socket and count words */
    val lines = ssc.socketTextStream("hdp-1", 9999)
    val pairs: DStream[(String, Int)] = lines.flatMap(_.split(" ")).map(x => (x, 1)).reduceByKey(_ + _)
    /* save the result to Redis; the connection is obtained per partition, on the executor */
    pairs.foreachRDD { rdd =>
      rdd.foreachPartition { partitionOfRecords =>
        var jedis: Jedis = null
        try {
          jedis = JedisPoolUtil.getConnection
          jedis.auth("123456")
          partitionOfRecords.foreach(record => jedis.hincrBy("rediswordCount", record._1, record._2))
        } catch {
          case ex: Exception => ex.printStackTrace()
        } finally {
          if (jedis != null) jedis.close()
        }
      }
    }
    ssc.start()
    ssc.awaitTermination()
  }
}
package com.grace.count;

import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;

public class JedisPoolUtil {
    /* volatile prevents instruction reordering during lazy initialization */
    private static volatile JedisPool jedisPool = null;
    private static final String HOST = "hdp-4";
    private static final int PORT = 6379;

    /* lazily initialized singleton via double-checked locking */
    public static Jedis getConnection() {
        if (jedisPool == null) {
            synchronized (JedisPoolUtil.class) {
                if (jedisPool == null) {
                    JedisPoolConfig config = new JedisPoolConfig();
                    config.setMaxTotal(30);
                    config.setMaxIdle(10);
                    jedisPool = new JedisPool(config, HOST, PORT);
                }
            }
        }
        return jedisPool.getResource();
    }
}
On the Linux host hdp-1:
nc -l 9999
hello hello you
hello hello me
Check the result in Redis:
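For example, with redis-cli (host and password taken from the code above):

redis-cli -h hdp-4 -a 123456 hgetall rediswordCount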
When a job actually runs, Spark breaks each RDD operation into tasks that execute on worker nodes. Before execution, Spark computes each task's closure, serializes it, and ships it to every executor. Jedis is not serializable, so a connection created on the driver cannot be captured by that closure; this is why the code above obtains the connection inside foreachPartition, on the executor side.
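A minimal sketch of the anti-pattern, assuming the same pairs DStream as above:

// WRONG: the Jedis instance is created on the driver and captured by the
// task closure; shipping the closure fails because Jedis is not serializable.
val jedis = new Jedis("hdp-4", 6379)
pairs.foreachRDD { rdd =>
  rdd.foreach(record => jedis.hincrBy("rediswordCount", record._1, record._2))
}
// Spark rejects this with "Task not serializable".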
Spark Streaming output operations:
Spark Streaming supports the following output operations:
1. print(): prints the first ten elements of every batch of the DStream on the driver node running the streaming application. Useful for development and debugging.
2. saveAsTextFiles(prefix, [suffix]): saves the DStream's contents as text files. The file name for each batch interval is generated from the prefix and suffix: "prefix-TIME_IN_MS[.suffix]".
3. saveAsObjectFiles(prefix, [suffix]): saves the DStream's contents as SequenceFiles of serialized Java objects, with the same file naming scheme.
4. saveAsHadoopFiles(prefix, [suffix]): saves the DStream's contents as Hadoop files, with the same file naming scheme.
5. foreachRDD(func): the most general output operation. It applies the function func to each RDD generated from the stream; func should push each RDD's data to an external system, e.g. save it to files or write it to a database over the network.
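As a quick illustration of option 2, the word-count stream from the Redis example could be persisted instead of printed; the HDFS path here is hypothetical:

// writes one directory per batch, named wordcount-<TIME_IN_MS>.txt
pairs.saveAsTextFiles("hdfs://hdp-1:9000/output/wordcount", "txt")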
Spark Streaming 2.2.0: word count over data from Kafka
1. Start Kafka
From the Kafka installation directory:
bin/kafka-server-start.sh -daemon config/server.properties
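The code below reads from a topic named animal. If it does not exist yet, it can be created first; the partition and replication settings here are assumptions for this cluster:

bin/kafka-topics.sh --create --bootstrap-server hdp-2:9092 --topic animal --partitions 3 --replication-factor 1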
2. Code
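Besides spark-streaming, this example needs the Kafka 0.10 integration artifact, which the dependency list at the top does not include; a likely addition for Scala 2.11 / Spark 2.2.0:

<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
    <version>2.2.0</version>
</dependency>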
package com.grace.sparkkafka

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent

object SparkSteamingLogAnalysis2 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[3]").setAppName("SparkSteamingLogAnalysis2")
    val ssc = new StreamingContext(conf, Seconds(10))
    // updateStateByKey requires checkpointing
    ssc.checkpoint("D:\\aaa")
    // get the Kafka input stream
    val sheData: InputDStream[ConsumerRecord[String, String]] = getKafka(ssc, "animal", "groupId")
    // per-key state update: add the current batch's counts to the accumulated count
    val updateFunc = (curVal: Seq[Int], preVal: Option[Int]) => {
      val total = curVal.sum             // counts in the current batch
      val previous = preVal.getOrElse(0) // accumulated count, 0 initially
      Some(total + previous)             // Some wraps the new accumulated count
    }
    // commit the consumed offsets back to Kafka asynchronously once each
    // batch's offset ranges are known; output operations run in the order defined
    sheData.foreachRDD { rdds =>
      val offsetRanges: Array[OffsetRange] = rdds.asInstanceOf[HasOffsetRanges].offsetRanges
      sheData.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
    }
    // word count with accumulation across batches
    sheData.map(_.value()).flatMap(_.split(" ")).map(word => (word, 1))
      .updateStateByKey(updateFunc).print()
    ssc.start()
    ssc.awaitTermination()
  }

  /**
   * Build the Kafka direct stream with its consumer configuration.
   */
  def getKafka(ssc: StreamingContext, topic: String, groupId: String) = {
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "hdp-2:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupId,
      "auto.offset.reset" -> "latest",
      "fetch.max.wait.ms" -> Integer.valueOf(500),
      "enable.auto.commit" -> java.lang.Boolean.valueOf(false)
    )
    val topics = Array(topic)
    KafkaUtils.createDirectStream[String, String](ssc, PreferConsistent,
      Subscribe[String, String](topics, kafkaParams))
  }
}
Sample data (aaa.txt):
aaa bbb ccc hello you hi you me yes
Produce data with a console producer (the original command here mistakenly showed the consumer):
[root@hdp-4 ~]# ./apps/kafka_2.12-2.2.0/bin/kafka-console-producer.sh --broker-list hdp-1:9092,hdp-2:9092,hdp-4:9092 --topic animal
hellio
hi
helo hi you
he
me me two
hi hi hi you me hello why where when
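To double-check what landed in the topic, the console consumer can be run in another terminal:

./apps/kafka_2.12-2.2.0/bin/kafka-console-consumer.sh --bootstrap-server hdp-1:9092,hdp-2:9092,hdp-4:9092 --topic animal --from-beginning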
Observe the output:
-------------------------------------------
Time: 1574836600000 ms
-------------------------------------------
(you,1)
(helo,1)
(hi,2)
-------------------------------------------
Time: 1574836650000 ms
-------------------------------------------
(h,1)
(you,1)
(helo,1)
(hi,2)
-------------------------------------------
Time: 1574836660000 ms
-------------------------------------------
(h,1)
(me,2)
(two,1)
(you,1)
(helo,1)
(hi,2)
-------------------------------------------
Time: 1574836700000 ms
-------------------------------------------
(h,1)
(me,3)
(two,1)
(you,2)
(hello,1)
(helo,1)
(hi,5)
(when,1)
(why,1)
(where,1)