1. Create a new Maven project in IDEA:
pom.xml
<properties>
<scala.version>2.11.8</scala.version>
<spark.version>2.3.0</spark.version>
<spark.artifact>2.11</spark.artifact>
<dependency.scope>compile</dependency.scope>
</properties>
<repositories>
<repository>
<id>scala-tools.org</id>
<name>Scala-Tools Maven2 Repository</name>
<url>http://scala-tools.org/repo-releases</url>
</repository>
</repositories>
<pluginRepositories>
<pluginRepository>
<id>scala-tools.org</id>
<name>Scala-Tools Maven2 Repository</name>
<url>http://scala-tools.org/repo-releases</url>
</pluginRepository>
</pluginRepositories>
<dependencies>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.17</version>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-core -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-sql -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-hive -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-mllib -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-mllib_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming-kafka-0-8 -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-clients</artifactId>
<version>0.9.0.1</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.4</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.specs</groupId>
<artifactId>specs</artifactId>
<version>1.2.5</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>io.spray</groupId>
<artifactId>spray-json_2.11</artifactId>
<version>1.3.2</version>
</dependency>
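<!-- Required for the org.apache.phoenix.spark._ import used in step 4 below.
     The version here is an assumption; use the phoenix-spark build that matches
     your HBase/Phoenix installation. -->
<dependency>
<groupId>org.apache.phoenix</groupId>
<artifactId>phoenix-spark</artifactId>
<version>4.14.0-HBase-1.4</version>
</dependency>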
</dependencies>
<build>
<sourceDirectory>src/main/scala</sourceDirectory>
<testSourceDirectory>src/test/scala</testSourceDirectory>
<plugins>
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>2.11.8</scalaVersion>
<args>
<arg>-target:jvm-1.8</arg>
</args>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-eclipse-plugin</artifactId>
<configuration>
<downloadSources>true</downloadSources>
<buildcommands>
<buildcommand>ch.epfl.lamp.sdt.core.scalabuilder</buildcommand>
</buildcommands>
<additionalProjectnatures>
<projectnature>ch.epfl.lamp.sdt.core.scalanature</projectnature>
</additionalProjectnatures>
<classpathContainers>
<classpathContainer>org.eclipse.jdt.launching.JRE_CONTAINER</classpathContainer>
<classpathContainer>ch.epfl.lamp.sdt.launching.SCALA_CONTAINER</classpathContainer>
</classpathContainers>
</configuration>
</plugin>
</plugins>
</build>
<reporting>
<plugins>
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</reporting>
2. Create a main class
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka._

object SparkKafkascamer {
  def main(args: Array[String]): Unit = {
    // Run in local mode and enable backpressure
    val sparkConf = new SparkConf()
      .setMaster("local[5]")
      .setAppName("SparkStreamingKafka_direct")
      .set("spark.streaming.backpressure.enabled", "true")
    // Batch interval of 5 seconds
    val ssc = new StreamingContext(sparkConf, Seconds(5))
    val topics = Set("sptest")
    // Consume data from the Kafka broker on node1, with the given group id
    val kafkaParams = Map(
      "bootstrap.servers" -> "node1.etonedu.cn:6667",
      "group.id" -> "sptest"
    )
    // Create a direct stream using the Kafka 0.8 API
    val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
    // Keep only the message values
    val lines = messages.map(_._2)
    // Hand every record to SparkConnectionScalaNew.runmain
    lines.foreachRDD { rdd =>
      rdd.collect().foreach(record => SparkConnectionScalaNew.runmain(record, ssc))
    }
    ssc.start()            // actually start the streaming job
    ssc.awaitTermination() // block and wait for termination
  }
}
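To exercise the stream you need something writing JSON to the sptest topic. Below is a minimal producer sketch using the kafka-clients dependency already in the pom; the broker address and the sample payload are the ones from this post, while the object name and the rest are only illustrative.

import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

// Hypothetical helper, not part of the original post: sends one sample
// JSON record to the "sptest" topic so the streaming job has input.
object TestProducer {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put("bootstrap.servers", "node1.etonedu.cn:6667")
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")

    val producer = new KafkaProducer[String, String](props)
    val json = """{"UID":"2019","FAMILY":"new","NAME":"2019-03-15 15:05:19","VALUE1":"12421312421"}"""
    producer.send(new ProducerRecord[String, String]("sptest", json))
    producer.close()
  }
}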
3. Create a class: SparkConnectionScalaNew
import java.util.UUID
import org.apache.spark.streaming.StreamingContext

object SparkConnectionScalaNew {
  def runmain(keyValues: String, ssc: StreamingContext): Unit = {
    // Generate a unique row key and pass the record on for parsing and writing
    val uuid = UUID.randomUUID().toString
    SparkKafkaMessNew.messMain(keyValues, ssc, uuid)
  }
}
4. Create a class: SparkKafkaMessNew
import java.util.Date
import java.text.SimpleDateFormat
import org.apache.spark.streaming.StreamingContext
import scala.util.parsing.json.JSON
import org.apache.phoenix.spark._

object SparkKafkaMessNew {
  // Unwrap the Option returned by JSON.parseFull; fall back to an empty map
  // if the message is not valid JSON (the original match was non-exhaustive)
  def regJson(json: Option[Any]): Map[String, Any] = json match {
    case Some(map: Map[String, Any] @unchecked) => map
    case _ => Map.empty[String, Any]
  }

  // Parse a JSON message and write it to Phoenix, e.g.
  // mess  = {"UID":"2019","FAMILY":"new","NAME":"2019-03-15 15:05:19","VALUE1":"12421312421"}
  // first = Map(UID -> 2019, FAMILY -> new, NAME -> 2019-03-15 15:05:19, VALUE1 -> 12421312421)
  def messMain(mess: String, ssc: StreamingContext, uuid: String): Unit = {
    val jsonS = JSON.parseFull(mess)
    val first = regJson(jsonS)
    println(mess)
    println(first)
    // Pull out the fields we need; the keys must match the (case-sensitive)
    // field names in the JSON message
    val name = first.getOrElse("NAME", "").toString.trim
    val family = first.getOrElse("FAMILY", "").toString.trim
    // Timestamp written to the VALUE1 column
    val date = new Date()
    val df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
    val dateRe = df.format(date)
    val sc = ssc.sparkContext
    // One tuple per row, in the same order as the column list below
    val dataSet = List((uuid, family, name, dateRe))
    sc.parallelize(dataSet).saveToPhoenix(
      "LOG_ANA.LOGFILE_TEST",
      Seq("UID", "FAMILY", "NAME", "VALUE1"),
      zkUrl = Some("node1.etonedu.cn:2181")
    )
  }
}
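Before wiring everything to Kafka and Phoenix, the parsing step can be checked in isolation. A minimal sketch, assuming only the sample message shown in the comment above; the object name is illustrative and not part of the original post.

import scala.util.parsing.json.JSON

// Hypothetical standalone check of the JSON parsing only (no Spark, Kafka or Phoenix)
object ParseCheck {
  def main(args: Array[String]): Unit = {
    val mess = """{"UID":"2019","FAMILY":"new","NAME":"2019-03-15 15:05:19","VALUE1":"12421312421"}"""
    val first = SparkKafkaMessNew.regJson(JSON.parseFull(mess))
    val name = first.getOrElse("NAME", "").toString.trim
    val family = first.getOrElse("FAMILY", "").toString.trim
    println(s"NAME=$name, FAMILY=$family") // expected: NAME=2019-03-15 15:05:19, FAMILY=new
  }
}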