1. Overview
Industrial IoT devices publish their data to EMQ X (emqtt) over MQTT, EMQ X forwards the messages into Kafka, and a Spark Streaming job consumes the Kafka topic and joins each micro-batch against reference tables kept in Oracle.
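For reference, here is a minimal publisher sketch that pushes one GPS alert to the broker, assuming the Eclipse Paho MQTT client and a hypothetical EMQ X endpoint of tcp://xxx:1883; the JSON fields mirror the CrashAlert case class parsed by the demo below, and the topic is assumed to be bridged by EMQ X into the Kafka topic pocGPS01.
import org.eclipse.paho.client.mqttv3.{MqttClient, MqttMessage}
import org.eclipse.paho.client.mqttv3.persistence.MemoryPersistence
object MqttPublishDemo {
  def main(args: Array[String]): Unit = {
    // Hypothetical EMQ X endpoint; replace host and port with the real broker address.
    val client = new MqttClient("tcp://xxx:1883", MqttClient.generateClientId(), new MemoryPersistence())
    client.connect()
    // Field names match the CrashAlert case class consumed by the Spark Streaming job.
    // alertTime uses Gson's default (locale-dependent) date format; adjust if the JVM locale differs.
    val payload = """{"num1":"E001","num2":"E002","tp":1,"status":1,"jd1":113.93,"wd1":22.54,"jd2":113.94,"wd2":22.55,"alertTime":"Oct 1, 2019 12:00:00 PM"}"""
    // Topic assumed to be bridged by EMQ X into the Kafka topic pocGPS01.
    client.publish("pocGPS01", new MqttMessage(payload.getBytes("UTF-8")))
    client.disconnect()
  }
}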
2. Demo example
package streamTest
import java.util.concurrent.Future
import java.util.{Date, Properties}
import com.google.gson.Gson
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord, RecordMetadata}
import org.apache.kafka.common.serialization.{StringDeserializer, StringSerializer}
import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.{Durations, Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010._
object KafkaStreamTest {
def main(args: Array[String]): Unit = {
val property = new Properties()
val url = "jdbc:oracle:thin:@//xxx:1634/GPS"
property.put("user","GPS2")
property.put("password","123456")
// Build the SparkSession first and reuse its SparkContext for the StreamingContext,
// so only one SparkContext exists in the driver.
val sparkSession = SparkSession.builder().appName("kafkaStreamTest").enableHiveSupport().getOrCreate()
val ssc = new StreamingContext(sparkSession.sparkContext, Durations.seconds(1))
// countByWindow below requires a checkpoint directory; the path here is a placeholder.
ssc.checkpoint("/tmp/kafkaStreamTest-checkpoint")
// Load the Oracle reference tables once, join them, and cache the result for reuse in every batch.
val rfrunDF = sparkSession.read.jdbc(url, "t_rfrun", property)
val driverDF = sparkSession.read.jdbc(url, "t_driver", property)
val msgDF = rfrunDF.join(driverDF, rfrunDF("WORKNUM") === driverDF("WORKNUM"), "inner")
.select(rfrunDF("equipnum"), rfrunDF("worknum"), rfrunDF("worktime"), rfrunDF("unworktime"), driverDF("name"), driverDF("remark2"))
msgDF.persist()
val topics = Array("pocGPS01")
val kafkaParams = Map[String, Object](
"bootstrap.servers" -> "xxx:9092",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> "stream",
"auto.offset.reset" -> "latest",
"enable.auto.commit" -> (false: java.lang.Boolean)
)
val kafkaStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](
ssc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)
)
val receiveStream = kafkaStream
.filter(r => r.value() != null && r.value().nonEmpty)  // drop empty messages
.map(x => handleJson2CaseClass(x.value()))
.mapPartitions(iter => {
iter.map(x => {
// distance in kilometres between the two reported positions
val distance = getDistance(x.jd1, x.wd1, x.jd2, x.wd2)
(x.num1, x.num2, distance, x.alertTime)
})
})
val crashSchema = StructType(List(
StructField("num1", StringType, false),
StructField("num2", StringType, false),
StructField("distance", DoubleType, false),
StructField("alertTime", TimestampType, false)))
val resultWorker = receiveStream.transform(rdd => {
// Convert java.util.Date to java.sql.Timestamp so it matches TimestampType.
val crash = rdd.map(x => Row(x._1, x._2, x._3, new java.sql.Timestamp(x._4.getTime)))
val crashDF = sparkSession.createDataFrame(crash, crashSchema)
val result = crashDF.filter("distance <= 0.04")
.join(msgDF, crashDF("num1") === msgDF("equipnum"), "left_outer")
.filter("alertTime <= unworktime and alertTime >= worktime")
.select(msgDF("worknum"), msgDF("name"), msgDF("remark2"), msgDF("worktime"), msgDF("unworktime"), crashDF("alertTime"))
.rdd
result
})
// Count alerts (distance below 40 m) over a sliding 3-minute window, emitted every 30 seconds.
val streamAlertWindowRdd1 = receiveStream.filter(_._3 < 0.04).countByWindow(Seconds(180), Seconds(30))
val kafkaProducer: Broadcast[KafkaSink[String, String]] = {
val kafkaProducerConfig = {
val p = new Properties()
p.setProperty("bootstrap.servers", "100.69.149.210:9092")
p.setProperty("key.serializer", classOf[StringSerializer].getName)
p.setProperty("value.serializer", classOf[StringSerializer].getName)
p.setProperty("acks","0")
p.setProperty("buffer.memory","102400")
p.setProperty("batch.size","1000")
p
}
ssc.sparkContext.broadcast(KafkaSink[String, String](kafkaProducerConfig))
}
resultWorker.foreachRDD(rdd => {
if (!rdd.isEmpty) {
rdd.foreach(record => {
kafkaProducer.value.send("ssc_test_1",record.toString())
// do something else
})
}
})
streamAlertWindowRdd1.foreachRDD(rdd => {
if (!rdd.isEmpty) {
rdd.foreach(record => {
kafkaProducer.value.send("ssc_test_window_1","3分钟警次数:"+record.toString())
// do something else
})
}
})
ssc.start()
ssc.awaitTermination()
}
case class CrashAlert(num1: String, num2: String, tp: Int, status: Int, jd1: Double, wd1: Double, jd2: Double, wd2: Double, alertTime: Date)
def handleJson2CaseClass(jsonStr: String): CrashAlert = {
val gson = new Gson()
gson.fromJson(jsonStr, classOf[CrashAlert])
}
def getDistance(jd1: Double, wd1: Double, jd2: Double, wd2: Double): Double = {
// Haversine great-circle distance in kilometres; jd* are longitudes, wd* are latitudes.
if (jd1 != 0 && wd1 != 0 && jd2 != 0 && wd2 != 0) {
val R = 6378.137
val radLat1 = wd1 * Math.PI / 180
val radLat2 = wd2 * Math.PI / 180
val a = radLat1 - radLat2
val b = jd1 * Math.PI / 180 - jd2 * Math.PI / 180
val s = 2 * Math.asin(Math.sqrt(Math.pow(Math.sin(a / 2), 2) + Math.cos(radLat1) * Math.cos(radLat2) * Math.pow(Math.sin(b / 2), 2)))
s * R
} else {
0.0
}
}
class KafkaSink[K, V](createProducer: () => KafkaProducer[K, V]) extends Serializable {
/* This is the key idea that allows us to work around running into
NotSerializableExceptions. */
lazy val producer = createProducer()
def send(topic: String, key: K, value: V): Future[RecordMetadata] =
producer.send(new ProducerRecord[K, V](topic, key, value))
def send(topic: String, value: V): Future[RecordMetadata] =
producer.send(new ProducerRecord[K, V](topic, value))
}
object KafkaSink {
import scala.collection.JavaConversions._
def apply[K, V](config: Map[String, Object]): KafkaSink[K, V] = {
val createProducerFunc = () => {
val producer = new KafkaProducer[K, V](config)
sys.addShutdownHook {
// Ensure that, on executor JVM shutdown, the Kafka producer sends
// any buffered messages to Kafka before shutting down.
producer.close()
}
producer
}
new KafkaSink(createProducerFunc)
}
def apply[K, V](config: java.util.Properties): KafkaSink[K, V] = apply(config.toMap)
}
}
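Once packaged, the job can be submitted to the cluster; a sketch of the launch command, where the master and the Oracle JDBC driver jar name (ojdbc6.jar) are placeholders:
spark-submit --master yarn --class streamTest.KafkaStreamTest --jars ojdbc6.jar testSpark-1.0-SNAPSHOT.jar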
3. Maven dependencies
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.cmft</groupId>
<artifactId>testSpark</artifactId>
<version>1.0-SNAPSHOT</version>
<repositories>
<repository>
<id>cloudera</id>
<url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
</repository>
<repository>
<id>cmhk.mirror</id>
<name>cmhk mirror.</name>
<url>xxx</url>
</repository>
<repository>
<id>nexus-cmft</id>
<name>cmft repository</name>
<url>http://xxx/</url>
</repository>
</repositories>
<pluginRepositories>
<pluginRepository>
<id>nexus-cmft</id>
<name>cmft repository</name>
<url>xxx</url>
</pluginRepository>
</pluginRepositories>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<spark.version>2.3.2</spark.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.6.0-cdh5.12.1</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>1.2.0-cdh5.12.1</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.28</version>
</dependency>
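<!-- Gson is used by the demo for JSON parsing (com.google.gson.Gson); the version below is an
     assumption. The Oracle JDBC driver (e.g. ojdbc6) must also be provided at runtime, for
     example via spark-submit --jars, as it is not distributed through Maven Central. -->
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.8.5</version>
</dependency>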
<!-- Spark dependency -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
<exclusions>
<exclusion>
<groupId>org.tachyonproject</groupId>
<artifactId>tachyon-client</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.curator</groupId>
<artifactId>curator-recipes</artifactId>
</exclusion>
<exclusion>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>net.sf.json-lib</groupId>
<artifactId>json-lib</artifactId>
<version>2.3</version>
<classifier>jdk15</classifier>
</dependency>
<dependency>
<groupId>org.json4s</groupId>
<artifactId>json4s-core_2.11</artifactId>
<version>3.2.11</version>
</dependency>
<dependency>
<groupId>org.json4s</groupId>
<artifactId>json4s-jackson_2.11</artifactId>
<version>3.2.11</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<version>2.15.2</version>
<configuration>
<recompileMode>modified-only</recompileMode>
</configuration>
<executions>
<execution>
<id>main-scalac</id>
<phase>process-resources</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.6.0</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
<executions>
<execution>
<phase>compile</phase>
<goals>
<goal>compile</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
<directory>target</directory>
<outputDirectory>target/classes</outputDirectory>
<testOutputDirectory>target/test-classes</testOutputDirectory>
<sourceDirectory>src</sourceDirectory>
</build>
</project>