应用场景是分析用户使用手机App的行为,描述如下所示:手机客户端会收集用户的行为事件(我们以点击事件为例),
将数据发送到数据服务器,我们假设数据在这里直接进入Kafka消息队列。后端的实时服务会从Kafka消费数据,
将数据读出来并进行实时分析,这里选择Spark Streaming,因为Spark Streaming提供了与Kafka整合的内置支持。经过Spark Streaming实时计算程序分析,
将结果写入Redis,可以实时获取用户的行为数据,并可以导出进行离线综合统计分析。
系统所需jar包配置:
<properties>
<start-class>com.intertid.oauth.startup.ServerStartup</start-class>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<java.version>1.8</java.version>
<commons.version>3.4</commons.version>
<org.apache.spark-version>2.2.0</org.apache.spark-version>
<hadoop.version>2.8.1</hadoop.version>
<scala-library.version>2.11.8</scala-library.version>
<scala.version>2.11</scala.version>
<akka.version>2.5.1</akka.version>
<akka.http.version>10.0.5</akka.http.version>
<io.circe.version>0.8.0</io.circe.version>
</properties>
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
<exclusions>
<exclusion>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>log4j-over-slf4j</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-redis</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.version}</artifactId>
<version>${org.apache.spark-version}</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
<exclusion>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>log4j-over-slf4j</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.version}</artifactId>
<version>${org.apache.spark-version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_${scala.version}</artifactId>
<version>${org.apache.spark-version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_${scala.version}</artifactId>
<version>${org.apache.spark-version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_${scala.version}</artifactId>
<version>${org.apache.spark-version}</version>
</dependency>
<!-- akka相关 -->
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala-library.version}</version>
</dependency>
<dependency>
<groupId>com.typesafe.akka</groupId>
<artifactId>akka-actor_${scala.version}</artifactId>
<version>${akka.version}</version>
</dependency>
<dependency>
<groupId>com.typesafe.akka</groupId>
<artifactId>akka-remote_${scala.version}</artifactId>
<version>${akka.version}</version>
</dependency>
<dependency>
<groupId>com.typesafe.akka</groupId>
<artifactId>akka-cluster_${scala.version}</artifactId>
<version>${akka.version}</version>
</dependency>
<dependency>
<groupId>com.typesafe.akka</groupId>
<artifactId>akka-contrib_${scala.version}</artifactId>
<version>${akka.version}</version>
</dependency>
<dependency>
<groupId>com.google.protobuf</groupId>
<artifactId>protobuf-java</artifactId>
<version>2.5.0</version>
</dependency>
<dependency>
<groupId>com.typesafe.akka</groupId>
<artifactId>akka-testkit_${scala.version}</artifactId>
<version>${akka.version}</version>
</dependency>
<dependency>
<groupId>com.typesafe.akka</groupId>
<artifactId>akka-slf4j_${scala.version}</artifactId>
<version>${akka.version}</version>
</dependency>
<dependency>
<groupId>com.typesafe.akka</groupId>
<artifactId>akka-http_${scala.version}</artifactId>
<version>${akka.http.version}</version>
</dependency>
<dependency>
<groupId>com.typesafe.akka</groupId>
<artifactId>akka-http-core_${scala.version}</artifactId>
<version>${akka.http.version}</version>
</dependency>
<dependency>
<groupId>com.typesafe.akka</groupId>
<artifactId>akka-http-testkit_${scala.version}</artifactId>
<version>${akka.http.version}</version>
</dependency>
<dependency>
<groupId>io.circe</groupId>
<artifactId>circe-core_${scala.version}</artifactId>
<version>${io.circe.version}</version>
</dependency>
<dependency>
<groupId>io.circe</groupId>
<artifactId>circe-generic_${scala.version}</artifactId>
<version>${io.circe.version}</version>
</dependency>
<dependency>
<groupId>io.circe</groupId>
<artifactId>circe-jawn_${scala.version}</artifactId>
<version>${io.circe.version}</version>
</dependency>
<dependency>
<groupId>org.codehaus.janino</groupId>
<artifactId>commons-compiler</artifactId>
<version>2.7.8</version>
</dependency>
<!--<dependency>
<groupId>org.slf4j</groupId>
<artifactId>log4j-over-slf4j</artifactId>
<version>1.7.25</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>1.7.5</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-simple</artifactId>
<version>1.6.4</version>
</dependency>-->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.1.39</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
</dependency>
</dependencies>
kafka消息处理通用工具类编写
class KafkaMsgProducer[K, V](buildProducer:() => KafkaProducer[K, V]) extends Serializable {
  // The producer is created lazily from the factory closure, so this wrapper
  // can be serialized (e.g. shipped to Spark executors) before any connection
  // to the brokers is opened.
  lazy val producer = buildProducer()

  /** Send a keyed record to `topic`; returns the broker's async acknowledgement. */
  def send(topic: String, key: K, value: V): Future[RecordMetadata] = {
    val record = new ProducerRecord[K, V](topic, key, value)
    producer.send(record)
  }

  /** Send an unkeyed record to `topic` (partition chosen by the producer). */
  def send(topic: String, value: V): Future[RecordMetadata] = {
    val record = new ProducerRecord[K, V](topic, value)
    producer.send(record)
  }
}
object KafkaMsgProducer {
  // NOTE: replaced the deprecated implicit `scala.collection.JavaConversions._`
  // with explicit JavaConverters (.asJava / .asScala) — implicit Java/Scala
  // collection conversions are discouraged and removed in newer Scala versions.
  import scala.collection.JavaConverters._

  /**
   * Build a lazily-initialized producer from a plain config map.
   * A JVM shutdown hook closes the producer so buffered records are flushed.
   */
  def apply[K, V](config: Map[String, Object]): KafkaMsgProducer[K, V] = {
    val createProducerCallback = () => {
      val producer = new KafkaProducer[K, V](config.asJava)
      sys.addShutdownHook {
        producer.close()
      }
      producer
    }
    new KafkaMsgProducer[K, V](createProducerCallback)
  }

  /** Convenience overload accepting java.util.Properties (e.g. loaded from a file). */
  def apply[K, V](config: java.util.Properties): KafkaMsgProducer[K, V] =
    this.apply(config.asScala.toMap)
}
Kafka Producer模拟程序,用来模拟向Kafka实时写入用户行为的事件数据,数据是JSON格式,示例如下:
{"click_count":6,"event_time":"1529030354244","ip":"123.235.242.27","os_type":"Android","uid":"4A4D769EB9679C054DE81B973ED5D768"}
object BehaviorKafkaDataServer {
  // Pool of fake user ids; randomUser() cycles through them round-robin.
  private val users = Array(
    "4A4D769EB9679C054DE81B973ED5D768", "8dfeb5aaafc027d89349ac9a20b3930f",
    "011BBF43B89BFBF266C865DF0397AA71", "f2a8474bf7bd94f0aabbd4cdd2c06dcf",
    "068b746ed4620d25e26055a9f804385f", "97edfc08311c70143401745a03a50706",
    "d7f141563005d1b5d0d3dd30138f3f62", "c8ee90aade1671a21336c721512b817a",
    "d7f141563005d1b5d0d3dd30138f4f62", "c8ee90aade1671a21336c721512b837a",
    "d7f141563005d1b5d0d3dd30138f5f62", "c8ee90aade1671a21336c721512b847a",
    "6b67c8c700427dee7552f81f3228c927", "a95f22eabc4fd4b580c011a3161a9d9d")
  private val random = new Random()
  private var pointer = -1

  /**
   * Return the next user id in round-robin order (despite the name, the
   * selection is NOT random). Simplified from an if/else whose two branches
   * returned the same expression.
   */
  def randomUser() : String = {
    pointer = (pointer + 1) % users.length
    users(pointer)
  }

  /** Random click count in [0, 10). */
  def click() : Int = {
    random.nextInt(10)
  }

  // bin/kafka-topics.sh --zookeeper master:2181,slave01:2181,slave02:2181/kafka --create --topic user_events --replication-factor 2 --partitions 2
  // bin/kafka-topics.sh --zookeeper master:2181,slave01:2181,slave02:2181/kafka --list
  // bin/kafka-topics.sh --zookeeper master:2181,slave01:2181,slave02:2181/kafka --describe user_events
  // bin/kafka-console-consumer.sh --zookeeper master:2181,slave01:2181,slave02:2181/kafka --topic test_json_basis_event --from-beginning

  /**
   * Emit one synthetic click event to Kafka every 5 seconds, forever.
   * `brokers`, `topic` and `IpUtil` are expected to be defined elsewhere
   * in the enclosing scope — TODO confirm.
   */
  def main(args: Array[String]): Unit = {
    val p = new Properties()
    p.setProperty("bootstrap.servers", brokers)
    p.setProperty("key.serializer", classOf[StringSerializer].getName)
    p.setProperty("value.serializer", classOf[StringSerializer].getName)
    val kafkaMsgProducer = KafkaMsgProducer[String, String](p)
    while(true) {
      val event = new JSONObject()
      event.put("uid", randomUser)
      event.put("event_time", System.currentTimeMillis.toString)
      event.put("os_type", "Android")
      event.put("ip", IpUtil.getRandomIp)
      event.put("click_count", click)
      kafkaMsgProducer.send(topic, event.toString)
      // Fixed typo in the log message: 己 -> 已.
      println("已发消息: " + event)
      Thread.sleep(5000)
    }
  }
}
spark streaming接收kafka数据分析后将结果写入redis,代码如下:
def main(args: Array[String]): Unit = {
  // Master URL may be overridden by the first CLI argument
  // (default: single-threaded local mode).
  val masterUrl = if (args.length > 0) args(0) else "local[1]"
  val conf = new SparkConf().setMaster(masterUrl).setAppName("UserClickCountStat")
  // 5-second micro-batches.
  val ssc = new StreamingContext(conf, Seconds(5))
  val topics = Set(topic)
  val kafkaParams = Map[String, Object](
    ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> brokers,
    ConsumerConfig.GROUP_ID_CONFIG -> groupId,
    ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
    ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer])
  val dbIndex = 1
  val clickHashKey = "app::users::click"
  val kafkaStream = KafkaUtils.createDirectStream[String, String](
    ssc,
    LocationStrategies.PreferConsistent,
    ConsumerStrategies.Subscribe[String, String](topics, kafkaParams))
  // Parse each record's JSON payload (a malformed record will still throw —
  // wrap in Try here if bad input should be skipped instead).
  val events = kafkaStream.flatMap(line => {
    val data = JSON.parseObject(line.value())
    Some(data)
  })
  // Per-batch click totals keyed by uid.
  val userClicks = events.map(x => (x.getString("uid"), x.getInteger("click_count"))).reduceByKey(_ + _)
  userClicks.foreachRDD(rdd => {
    rdd.foreachPartition(partitionOfRecords => {
      // FIX: borrow ONE Redis connection per partition instead of one per
      // record, and always return it to the pool even if a write fails.
      val jedis = RedisClient.pool.getResource
      try {
        jedis.select(dbIndex)
        partitionOfRecords.foreach { case (uid, clickCount) =>
          jedis.hincrBy(clickHashKey, uid, clickCount.toLong)
        }
      } finally {
        RedisClient.pool.returnResource(jedis)
      }
    })
  })
  ssc.start()
  ssc.awaitTermination()
}
数据分析参考: 简单之美