Spark Streaming, Part 1: Writing Data from Spark Streaming to HBase with a Concatenated Rowkey
Without further ado, straight to the code.
package com.iflytek.kafka
import java.text.SimpleDateFormat
import com.alibaba.fastjson.JSON
import com.iflytek.kafkaManager.HbaseSink
import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.util.Bytes
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.slf4j.LoggerFactory
object WC2Hbase {
  @transient lazy val logger = LoggerFactory.getLogger(this.getClass)

  def send2HbaseMain(ssc: StreamingContext): Unit = {
    ssc.checkpoint("hdfs://cdh01:8020/user/hive/warehouse/checkpointed/sdf")
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "cdh01:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "xx001",
      "auto.offset.reset" -> "latest", // "earliest" or "latest"
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    val topics = Array("pd_ry_txjl")
    val stream: InputDStream[ConsumerRecord[String, String]] =
      KafkaUtils.createDirectStream[String, String](
        ssc,
        PreferConsistent,
        Subscribe[String, String](topics, kafkaParams))
    // (key, value) pairs from Kafka; not used further below
    val kv: DStream[(String, String)] = stream.map(record => (record.key, record.value))
    val value: DStream[String] = stream.map(_.value())
    val mapDS = value.map(x => {
      // formatter kept around for a time-based rowkey prefix (unused in this toy example)
      val dataFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
      val nObject = JSON.parseObject(x)
      val bodyObject1 = nObject.getJSONObject("body")
      val bodyObject2 = bodyObject1.getJSONObject("body")
      val xqbm = bodyObject2.get("name").toString
      // Toy rowkey: just a random-digit prefix, for fun. In a real job this is where you would
      // build the rowkey, e.g. prepend a time or salt prefix; see the sketch after this listing.
      (scala.util.Random.nextInt(10) + "_" + xqbm, 1)
    })
    mapDS.foreachRDD(rdd => {
      if (!rdd.isEmpty()) {
        send2Hbase(rdd)
      }
    })
  }
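
  // With enable.auto.commit = false the offsets consumed above are never committed back to
  // Kafka, so a restart with auto.offset.reset = "latest" can skip records. A minimal sketch
  // of one common pattern (not what the code above does): capture the offset ranges on the
  // untransformed stream and commit them after the batch has been written, using
  // HasOffsetRanges / CanCommitOffsets from spark-streaming-kafka-0-10:
  //
  //   import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges}
  //   stream.foreachRDD { rdd =>
  //     val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  //     // ... write the batch to HBase ...
  //     stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
  //   }
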
  def send2Hbase(rdd: RDD[(String, Int)]): Unit = {
    if (!rdd.isEmpty) {
      rdd.foreachPartition(partition => {
        // one shared connection per executor JVM (see HbaseSink below)
        val conn = HbaseSink.getHbaseConn
        // get the target table once per partition rather than once per record
        val table = conn.getTable(TableName.valueOf("xy"))
        try {
          partition.foreach(f => {
            val put = new Put(Bytes.toBytes(f._1))
            put.addColumn(Bytes.toBytes("cf1"), Bytes.toBytes("count"), Bytes.toBytes(f._2.toString))
            table.put(put)
          })
        } catch {
          case e: Exception =>
            logger.error("failed to write partition to HBase", e)
        } finally {
          table.close()
        }
      })
      logger.info("RDD written to HBase")
    }
  }
}
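
The map step above only prepends a random digit. As its inline comment says, that spot is where a real rowkey would be concatenated, typically a salt bucket plus a time component so writes spread across regions while rows stay scannable by prefix. A minimal sketch, where the helper name buildRowKey and the bucket count of 10 are assumptions rather than anything from the original job:

import java.text.SimpleDateFormat
import java.util.Date

// hypothetical helper, not part of the original job
def buildRowKey(name: String, eventTimeMillis: Long, buckets: Int = 10): String = {
  val salt = (name.hashCode & Integer.MAX_VALUE) % buckets              // deterministic salt bucket
  val ts = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date(eventTimeMillis))
  s"${salt}_${ts}_$name"                                                // salt_timestamp_name
}

Inside the map step it would replace the random prefix, e.g. (buildRowKey(xqbm, System.currentTimeMillis()), 1).

The HBase connection helper used by send2Hbase lives in a second file:
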
package com.iflytek.kafkaManager
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.{Connection, ConnectionFactory}
import org.slf4j.LoggerFactory

object HbaseSink extends Serializable {
  @transient lazy val logger = LoggerFactory.getLogger(this.getClass)

  private val conf = HBaseConfiguration.create()
  conf.set("hbase.zookeeper.quorum", "cdh01,cdh02,cdh03")
  conf.set("hbase.zookeeper.property.clientPort", "2181")
  conf.set("zookeeper.znode.parent", "/hbase")

  // a single shared Connection per JVM: it is thread-safe and expensive to create
  private val conn = ConnectionFactory.createConnection(conf)

  // return the shared HBase connection
  def getHbaseConn: Connection = conn
}
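
Neither listing has an entry point. A minimal driver sketch (the object name WC2HbaseApp, app name, and 5-second batch interval are assumptions; adjust them to your own deployment):

package com.iflytek.kafka

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

// hypothetical driver, not in the original post
object WC2HbaseApp {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("WC2Hbase")
    val ssc = new StreamingContext(conf, Seconds(5))   // 5s micro-batches (assumed)
    WC2Hbase.send2HbaseMain(ssc)
    ssc.start()
    ssc.awaitTermination()
  }
}

The master and deploy mode come from the spark-submit command rather than the code.
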
The pom is as follows:
<properties>
<spark.version>2.3.2</spark.version>
<scala.version>2.11.8</scala.version>
<hbase.version>1.2.1</hbase.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>${spark.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<version>${spark.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-yarn_2.11</artifactId>
<version>${spark.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>${spark.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
<version>${spark.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql-kafka-0-10_2.11</artifactId>
<version>${spark.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.47</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-clients</artifactId>
<version>0.11.0.0</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.31</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>c3p0</groupId>
<artifactId>c3p0</artifactId>
<version>0.9.1.2</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>com.jolbox</groupId>
<artifactId>bonecp</artifactId>
<version>0.8.0.RELEASE</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId>
<version>3.4.13</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>${hbase.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-common</artifactId>
<version>${hbase.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-server</artifactId>
<version>${hbase.version}</version>
<scope>compile</scope>
</dependency>
<!--
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.7.2</version>
</dependency>
-->
<!-- the guava version must match the hadoop version -->
<!--
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>18.0</version>
</dependency>
-->
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
<scope>compile</scope>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.2.0</version>
<executions>
<execution>
<id>compile-scala</id>
<phase>compile</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>test-compile-scala</id>
<phase>test-compile</phase>
<goals>
<goal>add-source</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>2.11.8</scalaVersion>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.6.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
<compilerArgs>
<arg>-extdirs</arg>
<arg>${project.basedir}/lib</arg>
</compilerArgs>
</configuration>
</plugin>
</plugins>
</build>
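
One more thing the job assumes is that the target table already exists. If it does not, a one-off setup along these lines would create it (a sketch against the HBase 1.x admin API, reusing HbaseSink; the table name xy and column family cf1 match the code above):

import org.apache.hadoop.hbase.{HColumnDescriptor, HTableDescriptor, TableName}

// hypothetical one-off setup, not part of the original post
val admin = com.iflytek.kafkaManager.HbaseSink.getHbaseConn.getAdmin
val tableName = TableName.valueOf("xy")
if (!admin.tableExists(tableName)) {
  val desc = new HTableDescriptor(tableName)
  desc.addFamily(new HColumnDescriptor("cf1"))   // column family written by send2Hbase
  admin.createTable(desc)
}
admin.close()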