Development note: when writing data into HBase, import it in batches wherever possible. Inserting records one at a time makes the HBase client hit the HBase server on every single row; the usual online references describe the conventions for batch-loading into HBase.
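As a rough illustration of what client-side batching looks like, here is a minimal sketch using HBase's BufferedMutator, which buffers Puts and flushes them to the server in batches. The table name demo_table is a placeholder, and the ZooKeeper quorum is the one hard-coded in the job further below; adjust both for your cluster.

import java.util.UUID
import scala.collection.JavaConverters._
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes

object BatchPutSketch {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "cdh1:2181,cdh2:2181,cdh3:2181")
    val conn = ConnectionFactory.createConnection(conf)
    // "demo_table" is a placeholder; the table must already exist with column family "cf"
    val mutator = conn.getBufferedMutator(TableName.valueOf("demo_table"))
    try {
      // Build a batch of Puts instead of sending them one by one
      val puts = (1 to 1000).map { i =>
        val put = new Put(Bytes.toBytes(UUID.randomUUID().toString))
        put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("ip"), Bytes.toBytes(s"192.168.15.$i"))
        put
      }
      mutator.mutate(puts.asJava) // one buffered batch rather than 1000 round trips
      mutator.flush()
    } finally {
      mutator.close()
      conn.close()
    }
  }
}

The full job used in this post follows.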
package com.mhl.bigdata
import java.io.{FileInputStream, InputStream}
import java.util.{Properties, UUID}
import org.apache.commons.lang3.StringUtils
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put, Result}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{MapType, StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.storage.StorageLevel
object Work {
def main(args: Array[String]): Unit = {
//System.setProperty("HADOOP_USER_NAME","yhl")
var day = "2017-05-27"
var hour = "13"
/**
* 1. Argument validation
* The day and the hour are passed as two separate arguments, so args.length must be at least 2;
* an optional third argument is the path to an external properties file.
* The scheduling granularity is day + hour.
*/
val propFile = if (args.length >= 2) {
day = args(0)
hour = args(1)
if (args.length >= 3) {
args(2).trim
} else {
""
}
} else {
println("day hour is required!")
System.exit(1)
""
}
println(s"======run day=$day , hour=$hour")
// 2. Load the configuration file
val (logHdfsPath,table,hbasePort) = loadCfg(propFile)
val inputPath =logHdfsPath
val tablename =table
val HBASEPORT =hbasePort
// 3. Initialize Spark
val spark = initSparkSession()
val sc = spark.sparkContext
sc.setLogLevel("ERROR")
val applicationId = sc.applicationId
// 4. Build the DataFrame schema
val schema = new StructType(
Array(
StructField("ip", StringType, true),
StructField("request", StringType, true),
StructField("fields", MapType(StringType,StringType), true)
)
)
// 5. Process the data. Flume produces many small files on HDFS, so small files need to be handled when loading into the store.
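// (The distinct() below only deduplicates records; to actually reduce the number of tasks caused by many small
// input files, something like sc.textFile(inputPath).coalesce(numPartitions) could be applied first,
// where numPartitions is a placeholder to tune for the cluster.)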
val inputRDD =sc.textFile(inputPath).distinct()
inputRDD.cache()
val dataCount =inputRDD.count()
System.err.println(s"applicationID_${applicationId} datas items :${dataCount}")
// dataPut returns "ip,request,key1:v1,key2:v2,..."; limit the split to 3 so the whole fields string stays in x(2)
val df = inputRDD.map(x => dataPut(x).split(",", 3)).map(x => Row(x(0), x(1), stringToMap(x(2))))
val resultDF = spark.createDataFrame(df, schema).persist(StorageLevel.MEMORY_AND_DISK_SER)
resultDF.createOrReplaceTempView("wf")
//spark.sql("select ip,request,nvl(fields['d'],0) as dd from wf").show()
val resultDataFrame: DataFrame = spark.sql("select ip,request,nvl(fields['d'],0) as dd from wf")
// Convert the DataFrame to an RDD and load it into HBase in RDD form
val rowRDD: RDD[Row] = resultDataFrame.rdd
val rdd1: RDD[String] = rowRDD.map(_.toString)
val rdd:RDD[String] =rdd1.map(x=>x.substring(1,x.length-1))
// 6. Load the data into HBase
// Set the ZooKeeper quorum; alternatively hbase-site.xml can be put on the classpath, but setting it in code is preferred here
val conf = HBaseConfiguration.create()
conf.set("hbase.zookeeper.quorum","cdh1:2181,cdh2:2181,cdh3:2181")
// Open an HBase connection purely to log and verify connectivity
val conn = ConnectionFactory.createConnection(conf)
System.err.println("HBase connection established: " + conn)
conn.close()
// Note that this is the OUTPUT table
conf.set(TableOutputFormat.OUTPUT_TABLE, tablename)
// Create the Job
val job = Job.getInstance(conf)
// Set the output key class
job.setOutputKeyClass(classOf[ImmutableBytesWritable])
// Set the output value class (TableOutputFormat writes Mutation subclasses, so use Put rather than Result)
job.setOutputValueClass(classOf[Put])
// Set the OutputFormat
job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
// Get the job configuration
val jobConf =job.getConfiguration()
// Convert the RDD into a (key, Put) RDD that can be written to the HBase table, e.g. 192.168.15.238,GET,2532915
val resultRDD =rdd.map(_.split(",")).map(arr => {
/* A Put object represents one row; the row key is passed to its constructor.
* Every value inserted must be converted with org.apache.hadoop.hbase.util.Bytes.toBytes.
* Put.add takes three arguments: column family, column qualifier and value; note that Put.add is deprecated, use Put.addColumn instead.
*/
// Note: HBase stores everything as byte arrays; here all values are written as strings
val put = new Put(Bytes.toBytes(UUID.randomUUID().toString))
put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("ip"), Bytes.toBytes(arr(0)))
put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("request"), Bytes.toBytes(arr(1)))
put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("dd"), Bytes.toBytes(arr(2).toString))
(new ImmutableBytesWritable, put)
})
resultRDD.coalesce(1).saveAsNewAPIHadoopDataset(jobConf)
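// Note: coalesce(1) funnels all writes through a single task; for larger volumes it may be better to keep
// more partitions (or repartition) so that several executors write to HBase in parallel.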
System.err.println(s"applicationID_${applicationId} datas loading hbase success !")
spark.stop()
}
/**
* Initialize a SparkSession
* @return
*/
def initSparkSession(): SparkSession = {
SparkSession.builder()
.appName("Work")
.master("local")
.config("spark.sql.warehouse.dir", "/spark-warehouse/")
.config("spark.worker.ui.retainedExecutors", "200") // limit the driver/executor info kept in Worker memory
.config("spark.worker.ui.retainedDrivers", "200")
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
// .config("spark.kryoserializer.buffer", "1024m")
// .config("spark.kryoserializer.buffer.max", "2046m")
.config("spark.io.compression.codec", "snappy")
.config("spark.sql.parquet.binaryAsString", "true")
.config("spark.sql.crossJoin.enabled", "true")
.config("spark.sql.codegen", "true")
.config("spark.sql.unsafe.enabled", "true")
.config("spark.sql.shuffle.partitions", "200") // shuffle operations (joins etc.) run 200 tasks; this can be tuned up or down
.config("spark.shuffle.manager", "tungsten-sort")
.config("spark.network.timeout", "600")
.config("spark.testing.memory", "471859200")
.getOrCreate()
}
/**
* Load the configuration file
* @param propFile
* @return
*/
def loadCfg(propFile: String) = {
// Load the configuration file; by convention it is named bigdata.properties
val props = new Properties()
val in: InputStream = this.getClass.getClassLoader().getResourceAsStream("bigdata.properties")
// If no external file is passed in, fall back to the bigdata.properties on the driver classpath
if (StringUtils.isBlank(propFile)) {
if(in != null){
props.load(in)
}
else{
println(s"======bigdata.properties load fail !")
}
} else {
props.load(new FileInputStream(propFile))
}
/* // database name
val database = cfgContext.getString(PropertiesConstants.database)
if (database == null) {
//tableBasePath is required
println("properties database is required")
System.exit(1)
}*/
// table name
val table = props.getProperty("table")
if (StringUtils.isBlank(table)) {
println("properties table is required !")
System.exit(1)
}
println(s"======loading properties tablename is ${table} ")
// Base HDFS path of the log data; required.
/* Raw data example: 192.168.15.238 - - [12/Sep/2018:23:34:03 +0800] "GET /ap?extend=22c0c044cda5269e4c61a9b990812141&openId=oqgRMw1JAPXbo1TJgB_EJjgJCytA&model=CY_WiFi_I1&channel=BigWifi&price=0.2 HTTP/1.1" 200 5 "-" "-" "-"
*/
var logHdfsPath = props.getProperty("logHdfsPath")
if(StringUtils.isBlank(logHdfsPath)){
println(s"properties logHdfsPath is required !")
System.exit(1)
}
println(s"======loading properties inputpath is ${logHdfsPath}")
val hbasePort=props.getProperty("hbasePort")
if(StringUtils.isBlank(hbasePort)){
println(s"properties hbasePort is required !")
System.exit(1)
}
println(s"======loading properties hbasePort is ${hbasePort}")
(logHdfsPath,table,hbasePort)
}
/**
* Extract the HTTP request method
* @param line
* @return
*/
def dataPrase(line: String): String = {
// val lineOne: Array[String] = line.split(" ")
val lineTwo: Array[String] = line.split(" ")
lineTwo(5).split("\"")(1)
}
/**
* Extract the IP address
* @param line
* @return
*/
def dataPraseIP(line: String): String = {
line.split(" ")(0)
}
/**
* Convert a String into a Map, e.g. val fieldss = "extend:0fd759749ae039fd498a89e6732590682,openId:odfC2jjXiSYrforZPmwS5TNxtelU&mode"
* @param dataLine
* @return
*/
def stringToMap(dataLine: String): Map[String, String] = {
val map = scala.collection.mutable.Map[String, String]()
val datas = dataLine.split(",")
for (i <- datas) {
val ione = i.split(":")
map += (ione(0) -> ione(1))
}
// convert the mutable map into an immutable one
map.toMap
}
/**
* Extract the user's action parameters, e.g. extend=3cd9a842fa0a1856805e44c928933038&openId=ovvuQ0SWq3IrvqWb7CFcrknpyGDg&model=BLINK_WR316
* @param line
* @return
*/
def dataPraseID(line: String): String = {
// val map = scala.collection.mutable.Map[String, String]()
val sb = new StringBuilder()
val dataLine = line.split("\\?")(1).split("/")(0)
val datas = dataLine.split("\\&")
for (i <- datas) {
val ione = i.split("\\=")
sb.append(ione(0)).append(":").append(ione(1)).append(",")
}
// convert to a String and drop the trailing comma
val sb1: String = sb.toString()
sb1.substring(0,sb1.length-1)
}
/**
* Extract the IP, the request method and the user's action parameters
* @param line
* @return
*/
def dataPut(line: String): String = {
val sb1 = new StringBuilder()
sb1.append(dataPraseIP(line))
.append(",")
.append(dataPrase(line))
.append(",")
.append(dataPraseID(line))
sb1.toString()
}
}
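For reference, loadCfg expects a bigdata.properties file, either on the driver classpath or passed as the third program argument, containing the keys table, logHdfsPath and hbasePort. A minimal sketch with placeholder values (the target table must already exist in HBase with column family cf, e.g. create 'wf_log','cf' in the hbase shell):

# bigdata.properties (placeholder values, adjust to your environment)
# target HBase table
table=wf_log
# HDFS base path of the Flume log files
logHdfsPath=/data/flume/wf/2017-05-27/13
# ZooKeeper client port used by HBase
hbasePort=2181

The job can then be submitted roughly as follows, where the jar name and paths are placeholders:

spark-submit --class com.mhl.bigdata.Work work.jar 2017-05-27 13 /path/to/bigdata.properties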