Development note: when writing data into HBase, import it in batches wherever possible. Inserting records one at a time makes the HBase client hit the HBase server on every single row; the usual online references describe the conventions for batch-loading into HBase.
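As a rough illustration of what client-side batching looks like, here is a minimal sketch using HBase's BufferedMutator, which buffers Puts and flushes them to the server in batches. The table name demo_table is a placeholder, and the ZooKeeper quorum is the one hard-coded in the job further below; adjust both for your cluster.

import java.util.UUID
import scala.collection.JavaConverters._
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes

object BatchPutSketch {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "cdh1:2181,cdh2:2181,cdh3:2181")
    val conn = ConnectionFactory.createConnection(conf)
    // "demo_table" is a placeholder; the table must already exist with column family "cf"
    val mutator = conn.getBufferedMutator(TableName.valueOf("demo_table"))
    try {
      // Build a batch of Puts instead of sending them one by one
      val puts = (1 to 1000).map { i =>
        val put = new Put(Bytes.toBytes(UUID.randomUUID().toString))
        put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("ip"), Bytes.toBytes(s"192.168.15.$i"))
        put
      }
      mutator.mutate(puts.asJava) // one buffered batch rather than 1000 round trips
      mutator.flush()
    } finally {
      mutator.close()
      conn.close()
    }
  }
}

The full job used in this post follows.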
package com.mhl.bigdata
import java.io.{FileInputStream, InputStream}
import java.util.{Properties, UUID}
import org.apache.commons.lang3.StringUtils
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put, Result}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{MapType, StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.storage.StorageLevel
object Work {
def main(args: Array[String]): Unit = {
//System.setProperty("HADOOP_USER_NAME","yhl")
var day = "2017-05-27"
var hour = "13"
/**
* 1. Argument validation
* The day and the hour are passed as two separate arguments, so args.length must be at least 2;
* an optional third argument is the path to an external properties file.
* The scheduling granularity is day + hour.
*/
val propFile = if (args.length >= 2) {
day = args(0)
hour = args(1)
if (args.length >= 3) {
args(2).trim
} else {
""
}
} else {
println("day hour is required!")
System.exit(1)
""
}
println(s"======run day=$day , hour=$hour")
// 2. Load the configuration file
val (logHdfsPath,table,hbasePort) = loadCfg(propFile)
val inputPath =logHdfsPath
val tablename =table
val HBASEPORT =hbasePort
// 3. Initialize Spark
val spark = initSparkSession()
val sc = spark.sparkContext
sc.setLogLevel("ERROR")
val applicationId = sc.applicationId
// 4. Build the DataFrame schema
val schema = new StructType(
Array(
StructField("ip", StringType, true),
StructField("request", StringType, true),
StructField("fields", MapType(StringType,StringType), true)
)
)
// 5. Process the data. Flume produces many small files on HDFS, so small files need to be handled when loading into the store.
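// (The distinct() below only deduplicates records; to actually reduce the number of tasks caused by many small
// input files, something like sc.textFile(inputPath).coalesce(numPartitions) could be applied first,
// where numPartitions is a placeholder to tune for the cluster.)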
val inputRDD =sc.textFile(inputPath).distinct()
inputRDD.cache()
val dataCount =inputRDD.count()
System.err.println(s"applicationID_${applicationId} datas items :${dataCount}")
// dataPut returns "ip,request,key1:v1,key2:v2,..."; limit the split to 3 so the whole fields string stays in x(2)
val df = inputRDD.map(x => dataPut(x).split(",", 3)).map(x => Row(x(0), x(1), stringToMap(x(2))))
val resultDF = spark.createDataFrame(df, schema).persist(StorageLevel.MEMORY_AND_DISK_SER)
resultDF.createOrReplaceTempView("wf")
//spark.sql("select ip,request,nvl(fields['d'],0) as dd from wf").show()
val resultDataFrame: DataFrame = spark.sql("select ip,request,nvl(fields['d'],0) as dd from wf")
// Convert the DataFrame to an RDD and load it into HBase in RDD form
val rowRDD: RDD[Row] = resultDataFrame.rdd
val rdd1: RDD[String] = rowRDD.map(_.toString)
val rdd:RDD[String] =rdd1.map(x=>x.substring(1,x.length-1))
// 6. Load the data into HBase
// Set the ZooKeeper quorum; alternatively hbase-site.xml can be put on the classpath, but setting it in code is preferred here
val conf = HBaseConfiguration.create()
conf.set("hbase.zookeeper.quorum","cdh1:2181,cdh2:2181,cdh3:2181")
// Open an HBase connection purely to log and verify connectivity
val conn = ConnectionFactory.createConnection(conf)
System.err.println("HBase connection established: " + conn)
conn.close()
// Note that this is the OUTPUT table
conf.set(TableOutputFormat.OUTPUT_TABLE, tablename)
// Create the Job
val job = Job.getInstance(conf)
// Set the output key class
job.setOutputKeyClass(classOf[ImmutableBytesWritable])
// Set the output value class (TableOutputFormat writes Mutation subclasses, so use Put rather than Result)
job.setOutputValueClass(classOf[Put])
// Set the OutputFormat
job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
// Get the job configuration
val jobConf =job.getConfiguration()
// Convert the RDD into a (key, Put) RDD that can be written to the HBase table, e.g. 192.168.15.238,GET,2532915
val resultRDD =rdd.map(_.split(",")).map(arr => {
/* A Put object represents one row; the row key is passed to its constructor.
* Every value inserted must be converted with org.apache.hadoop.hbase.util.Bytes.toBytes.
* Put.add takes three arguments: column family, column qualifier and value; note that Put.add is deprecated, use Put.addColumn instead.
*/
// Note: HBase stores everything as byte arrays; here all values are written as strings
val put = new Put(Bytes.toBytes(UUID.randomUUID().toString))
put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("ip"), Bytes.toBytes(arr(0)))
put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("request"), Bytes.toBytes(arr(1)))
put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("dd"), Bytes.toBytes(arr(2).toString))
(new ImmutableBytesWritable, put)
})
resultRDD.coalesce(1).saveAsNewAPIHadoopDataset(jobConf)
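// Note: coalesce(1) funnels all writes through a single task; for larger volumes it may be better to keep
// more partitions (or repartition) so that several executors write to HBase in parallel.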
System.err.println(s"applicationID_${applicationId} datas loading hbase success !")
spark.stop()
}
/**
* Initialize a SparkSession
* @return
*/
def initSparkSession(): SparkSession = {
SparkSession.builder()
.appName("Work")
.master("local")
.config("spark.sql.warehouse.dir", "/spark-warehouse/")
.config("spark.worker.ui.retainedExecutors", "200") // limit the driver/executor info kept in Worker memory
.config("spark.worker.ui.retainedDrivers", "200")
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
// .config("spark.kryoserializer.buffer", "1024m")
// .config("spark.kryoserializer.buffer.max", "2046m")
.config("spark.io.compression.codec", "snappy")
.config("spark.sql.parquet.binaryAsString", "true")
.config("spark.sql.crossJoin.enabled", "true")
.config("spark.sql.codegen", "true")
.config("spark.sql.unsafe.enabled", "true")
.config("spark.sql.shuffle.partitions", "200") // shuffle operations (joins etc.) run 200 tasks; this can be tuned up or down
.config("spark.shuffle.manager", "tungsten-sort")
.config("spark.network.timeout", "600")
.config("spark.testing.memory", "471859200")
.getOrCreate()
}
/**
* Load the configuration file
* @param propFile
* @return
*/
def loadCfg(propFile: String) = {
// Load the configuration file; by convention it is named bigdata.properties
val props = new Properties()
val in: InputStream = this.getClass.getClassLoader().getResourceAsStream("bigdata.properties")
// If no external file is passed in, fall back to the bigdata.properties on the driver classpath
if (StringUtils.isBlank(propFile)) {
if(in != null){
props.load(in)
}
else{
println(s"======bigdata.properties load fail !")
}
} else {
props.load(new FileInputStream(propFile))
}
/* // database name
val database = cfgContext.getString(PropertiesConstants.database)
if (database == null) {
//tableBasePath is required
println("properties database is required")
System.exit(1)
}*/
// table name
val table = props.getProperty("table")
if (StringUtils.isBlank(table)) {
println("properties table is required !")
System.exit(1)
}
println(s"======loading properties tablename is ${table} ")
// Base HDFS path of the log data; required.
/* Raw data example: 192.168.15.238 - - [12/Sep/2018:23:34:03 +0800] "GET /ap?extend=22c0c044cda5269e4c61a9b990812141&openId=oqgRMw1JAPXbo1TJgB_EJjgJCytA&model=CY_WiFi_I1&channel=BigWifi&price=0.2 HTTP/1.1" 200 5 "-" "-" "-"
*/
var logHdfsPath = props.getProperty("logHdfsPath")
if(StringUtils.isBlank(logHdfsPath)){
println(s"properties logHdfsPath is required !")
System.exit(1)
}
println(s"======loading properties inputpath is ${logHdfsPath}")
val hbasePort=props.getProperty("hbasePort")
if(StringUtils.isBlank(hbasePort)){
println(s"properties hbasePort is required !")
System.exit(1)
}
println(s"======loading properties hbasePort is ${hbasePort}")
(logHdfsPath,table,hbasePort)
}
/**
* Extract the HTTP request method
* @param line
* @return
*/
def dataPrase(line: String): String = {
// val lineOne: Array[String] = line.split(" ")
val lineTwo: Array[String] = line.split(" ")
lineTwo(5).split("\"")(1)
}
/**
* Extract the IP address
* @param line
* @return
*/
def dataPraseIP(line: String): String = {
line.split(" ")(0)
}
/**
* Convert a String into a Map, e.g. val fieldss = "extend:0fd759749ae039fd498a89e6732590682,openId:odfC2jjXiSYrforZPmwS5TNxtelU&mode"
* @param dataLine
* @return
*/
def stringToMap(dataLine: String): Map[String, String] = {
val map = scala.collection.mutable.Map[String, String]()
val datas = dataLine.split(",")
for (i <- datas) {
val ione = i.split(":")
map += (ione(0) -> ione(1))
}
// convert the mutable map into an immutable one
map.toMap
}
/**
* Extract the user's action parameters, e.g. extend=3cd9a842fa0a1856805e44c928933038&openId=ovvuQ0SWq3IrvqWb7CFcrknpyGDg&model=BLINK_WR316
* @param line
* @return
*/
def dataPraseID(line: String): String = {
// val map = scala.collection.mutable.Map[String, String]()
val sb = new StringBuilder()
val dataLine = line.split("\\?")(1).split("/")(0)
val datas = dataLine.split("\\&")
for (i <- datas) {
val ione = i.split("\\=")
sb.append(ione(0)).append(":").append(ione(1)).append(",")
}
// convert to a String and drop the trailing comma
val sb1: String = sb.toString()
sb1.substring(0,sb1.length-1)
}
/**
* Extract the IP, the request method and the user's action parameters
* @param line
* @return
*/
def dataPut(line: String): String = {
val sb1 = new StringBuilder()
sb1.append(dataPraseIP(line))
.append(",")
.append(dataPrase(line))
.append(",")
.append(dataPraseID(line))
sb1.toString()
}
}
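For reference, loadCfg expects a bigdata.properties file, either on the driver classpath or passed as the third program argument, containing the keys table, logHdfsPath and hbasePort. A minimal sketch with placeholder values (the target table must already exist in HBase with column family cf, e.g. create 'wf_log','cf' in the hbase shell):

# bigdata.properties (placeholder values, adjust to your environment)
# target HBase table
table=wf_log
# HDFS base path of the Flume log files
logHdfsPath=/data/flume/wf/2017-05-27/13
# ZooKeeper client port used by HBase
hbasePort=2181

The job can then be submitted roughly as follows, where the jar name and paths are placeholders:

spark-submit --class com.mhl.bigdata.Work work.jar 2017-05-27 13 /path/to/bigdata.properties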