1. Batch writing with saveAsNewAPIHadoopDataset (for up to tens of millions of rows)
import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
object ReadHive2Hbase3 {
/**
* Replace a null or empty string with the literal "null"
*
* @param str the input string, possibly null
* @return the original string, or "null" if it was null or empty
*/
def nullHandle(str: String): String = {
if (str == null || "".equals(str)) "null" else str
}
def main(args: Array[String]): Unit = {
val dateTime = args(0)
// val conf2 = new SparkConf().setMaster("local[*]")
val conf = new SparkConf()
conf.set("javax.jdo.option.ConnectionURL", "jdbc:mysql://mysql.hadoop:5480/hive?createDatabaseIfNotExist=true&characterEncoding=UTF-8")
conf.set("javax.jdo.option.ConnectionDriverName", "com.mysql.jdbc.Driver")
conf.set("javax.jdo.option.ConnectionUserName", "root")
conf.set("javax.jdo.option.ConnectionPassword", "D5u8SS+qCbT8")
val spark = SparkSession
.builder()
.config(conf)
.enableHiveSupport()
.config("spark.sql.warehouse.dir", "spark-warehouse")
.getOrCreate()
val sc = spark.sparkContext
spark.sql("use aijiami")
// val dateTime = "20190718"
//Read the data from Hive; it lives on HDFS. The Hive table here is external, but an internal table works exactly the same way
val hiveData = spark.sql("SELECT " +
"deviceId," +
"shortappkey," +
"province," +
"city," +
"factory," +
"phoneOS," +
"networkType," +
"deviceScreenStr," +
"operateTypeStr," +
"appKey," +
"user_name," +
"user_sex," +
"user_age," +
"user_education," +
"user_occupation," +
"source_channel," +
"urlTime " +
// "from ods_event_detail where dt = " + dateTime + " limit 10")
"from ods_event_detail where dt = " + dateTime)
// hiveData.show(10)
val hconf = sc.hadoopConfiguration
//HBase clients locate the cluster through ZooKeeper, so the quorum and client port are all that is needed here
hconf.set("hbase.zookeeper.quorum", "172.10.4.xx,172.10.4.xx,172.10.4.xx")
hconf.set("hbase.zookeeper.property.clientPort", "2181")
hconf.set(TableOutputFormat.OUTPUT_TABLE, "wxgz_user_data")
val job = Job.getInstance(hconf)
job.setOutputKeyClass(classOf[ImmutableBytesWritable])
job.setOutputValueClass(classOf[Put]) //the values written through TableOutputFormat are Put mutations, not Results
job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
job.setJobName("hive2hbase")
hiveData.rdd.map(row => {
val shortappkey = nullHandle(row.getAs[String]("shortappkey"))
val deviceId = nullHandle(row.getAs[String]("deviceId"))
val rowkey = HbaseRowKeyUtil.getRowKey(shortappkey, deviceId)
val put = new Put(Bytes.toBytes(rowkey)) //the Put is keyed by the rowkey
put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("deviceId"), Bytes.toBytes(deviceId))
//add the remaining selected columns with further addColumn calls in the same way
(new ImmutableBytesWritable, put) //return a (key, value) tuple
}).saveAsNewAPIHadoopDataset(job.getConfiguration) //write into HBase
spark.stop()
}
}
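Both examples call HbaseRowKeyUtil.getRowKey, which the original post never shows. A minimal hypothetical sketch, assuming the rowkey is the natural key prefixed with a short MD5 salt so writes spread across regions (the real implementation may differ):
import org.apache.hadoop.hbase.util.{Bytes, MD5Hash}
object HbaseRowKeyUtil {
//Hypothetical helper: prefix the natural key with a 4-character hash salt so rowkeys distribute evenly
def getRowKey(shortappkey: String, deviceId: String): String = {
val salt = MD5Hash.getMD5AsHex(Bytes.toBytes(shortappkey + deviceId)).substring(0, 4)
salt + "_" + shortappkey + "_" + deviceId
}
}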
2. Writing via bulkload (for data on the order of hundreds of millions of rows):
package com.dianyou
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{HFileOutputFormat2, LoadIncrementalHFiles}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, KeyValue, TableName}
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
object ReadHive2Hbase {
/**
* Replace a null or empty string with the literal "null"
*
* @param str the input string, possibly null
* @return the original string, or "null" if it was null or empty
*/
def nullHandle(str: String): String = {
if (str == null || "".equals(str)) "null" else str
}
def main(args: Array[String]): Unit = {
//setMaster("local[*]") is only for local testing; remove it when submitting to YARN (see the script below)
val conf2 = new SparkConf().setMaster("local[*]")
conf2.set("javax.jdo.option.ConnectionURL", "jdbc:mysql://mysql.hadoop:5480/hive?createDatabaseIfNotExist=true&characterEncoding=UTF-8")
conf2.set("javax.jdo.option.ConnectionDriverName", "com.mysql.jdbc.Driver")
conf2.set("javax.jdo.option.ConnectionUserName", "root")
conf2.set("javax.jdo.option.ConnectionPassword", "D5u8SS+qCbT8")
val spark = SparkSession
.builder()
.config(conf2)
.enableHiveSupport()
.config("spark.sql.warehouse.dir", "spark-warehouse")
.getOrCreate()
spark.sql("use aijiami")
val dateTime = "20190718"
//Read the data from Hive; it lives on HDFS. The Hive table here is external, but an internal table works exactly the same way
val hiveData = spark.sql("SELECT " +
"deviceId," +
"shortappkey," +
"province," +
"city," +
"factory," +
"phoneOS," +
"networkType," +
"deviceScreenStr," +
"operateTypeStr," +
"appKey," +
"user_name," +
"user_sex," +
"user_age," +
"user_education," +
"user_occupation," +
"source_channel," +
"urlTime " +
"from ods_event_detail where dt = " + dateTime + " limit 10")
hiveData.show(10)
val dataRdd: RDD[(String, (String, String, String))] = hiveData.rdd.flatMap(row => {
val shortappkey = nullHandle(row.getAs[String]("shortappkey"))
val deviceId = nullHandle(row.getAs[String]("deviceId"))
if (!"null".equals(deviceId)) {
val rowkey = HbaseRowKeyUtil.getRowKey(shortappkey, deviceId)
Array(
(rowkey, ("cf", "deviceId", nullHandle(row.getAs[String]("deviceId")))),
(rowkey, ("cf", "shortappkey", nullHandle(row.getAs[String]("shortappkey")))),
(rowkey, ("cf", "province", nullHandle(row.getAs[String]("province")))),
(rowkey, ("cf", "city", nullHandle(row.getAs[String]("city")))),
(rowkey, ("cf", "factory", nullHandle(row.getAs[String]("factory")))),
(rowkey, ("cf", "phoneOS", nullHandle(row.getAs[String]("phoneOS")))),
(rowkey, ("cf", "networkType", nullHandle(row.getAs[String]("networkType")))),
(rowkey, ("cf", "deviceScreenStr", nullHandle(row.getAs[String]("deviceScreenStr")))),
(rowkey, ("cf", "operateTypeStr", nullHandle(row.getAs[String]("operateTypeStr")))),
(rowkey, ("cf", "appKey", nullHandle(row.getAs[String]("appKey")))),
(rowkey, ("cf", "user_name", nullHandle(row.getAs[String]("user_name")))),
(rowkey, ("cf", "user_sex", nullHandle(row.getAs[String]("user_sex")))),
(rowkey, ("cf", "user_age", nullHandle(row.getAs[String]("user_age")))),
(rowkey, ("cf", "user_education", nullHandle(row.getAs[String]("user_education")))),
(rowkey, ("cf", "user_occupation", nullHandle(row.getAs[String]("user_occupation")))),
(rowkey, ("cf", "source_channel", nullHandle(row.getAs[String]("source_channel")))),
(rowkey, ("cf", "user_age", nullHandle(row.getAs[String]("user_age")))),
(rowkey, ("cf", "urlTime", nullHandle(row.getAs[String]("urlTime"))))
)
} else {
//flatMap must return a collection; returning an empty one instead of null avoids a NullPointerException
Array.empty[(String, (String, String, String))]
}
})
//Rowkeys, column families, and qualifiers must be written in overall sorted order, so sort before converting
val rdds = dataRdd.sortBy(x => (x._1, x._2._1, x._2._2)).map(x => {
//Convert each element into the shape HFileOutputFormat2 expects: the key must be an ImmutableBytesWritable
//and the value a KeyValue
val rowKey = Bytes.toBytes(x._1)
val family = Bytes.toBytes(x._2._1)
val column = Bytes.toBytes(x._2._2)
val value = Bytes.toBytes(x._2._3)
(new ImmutableBytesWritable(rowKey), new KeyValue(rowKey, family, column, value))
})
//Temporary HFile output directory on HDFS
val tmpdir = "/tmp/test_hbase"
//Create the HBase configuration; clients locate the cluster through the ZooKeeper quorum
val conf = HBaseConfiguration.create()
conf.set("hbase.zookeeper.quorum", "172.10.4.95,172.10.4.96,172.10.4.97")
conf.set("hbase.zookeeper.property.clientPort", "2181")
//Raise this limit if the job would otherwise produce too many HFiles per region and family to import
// conf.setInt("hbase.mapreduce.bulkload.max.hfiles.perRegion.perFamily", 1000)
//HBase table name
val tableName = "wxgz_user_data"
//Create the HBase connection from the configuration above
val conn = ConnectionFactory.createConnection(conf)
//Get the table by name
val table = conn.getTable(TableName.valueOf(tableName))
try {
//Get the region layout of the HBase table
val regionLocator = conn.getRegionLocator(TableName.valueOf(tableName))
//Create a Hadoop MapReduce job to carry the output configuration
val job = Job.getInstance(conf)
//Any job name will do
job.setJobName("ReadHive2Hbase")
//Crucial: we are generating HFiles, so the output key must be ImmutableBytesWritable
job.setMapOutputKeyClass(classOf[ImmutableBytesWritable])
//and the output value must be a KeyValue
job.setMapOutputValueClass(classOf[KeyValue])
//Configure HFileOutputFormat2 (per-region total ordering) BEFORE the HFiles are written
HFileOutputFormat2.configureIncrementalLoad(job, table, regionLocator)
//Once this finishes, the HFiles sit under tmpdir
rdds.saveAsNewAPIHadoopFile(tmpdir,
classOf[ImmutableBytesWritable],
classOf[KeyValue],
classOf[HFileOutputFormat2],
job.getConfiguration)
//Import the generated HFiles into HBase; from here on these are plain HBase API calls
val load = new LoadIncrementalHFiles(conf)
//Start the import
load.doBulkLoad(new Path(tmpdir), conn.getAdmin, table, regionLocator)
} finally {
table.close()
conn.close()
}
spark.close()
}
}
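Bulkload throughput scales with the number of regions: configureIncrementalLoad partitions the HFiles by region, so a table with a single region funnels every write through one partition. A hypothetical sketch of creating wxgz_user_data pre-split on the (assumed) hex salt prefix of the rowkey; the split points below are illustrative only and must match your real rowkey distribution:
import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
object CreatePresplitTable {
def main(args: Array[String]): Unit = {
val conf = HBaseConfiguration.create()
conf.set("hbase.zookeeper.quorum", "172.10.4.95,172.10.4.96,172.10.4.97")
conf.set("hbase.zookeeper.property.clientPort", "2181")
val conn = ConnectionFactory.createConnection(conf)
val admin = conn.getAdmin
try {
val desc = new HTableDescriptor(TableName.valueOf("wxgz_user_data"))
desc.addFamily(new HColumnDescriptor("cf"))
//Illustrative split points assuming hex-salted rowkeys; four regions: [,4), [4,8), [8,c), [c,)
val splits = Array("4", "8", "c").map(s => Bytes.toBytes(s))
admin.createTable(desc, splits)
} finally {
admin.close()
conn.close()
}
}
}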
YARN submit script:
/zywa/spark/spark-2.1.1-bin-hadoop2.7/bin/spark-submit \
--class com.dianyou.offline.utils.ReadHive2Hbase \
--master yarn \
--deploy-mode cluster \
--num-executors 16 \
--executor-memory 2g \
--executor-cores 2 \
--driver-cores 2 \
--driver-memory 4g \
--name aaa3 \
--conf spark.default.parallelism=100 \
--conf spark.memory.storageFraction=0.4 \
--conf spark.streaming.unpersist=true \
--conf spark.streaming.backpressure.enabled=true \
--conf spark.streaming.kafka.maxRatePerPartition=1500 \
--conf spark.network.timeout=300 \
--conf spark.streaming.kafka.consumer.poll.ms=30000 \
--conf spark.driver.extraJavaOptions="-Dlog4j.configuration=file:log4j.properties" \
--conf spark.executor.extraJavaOptions="-XX:+UseParNewGC -XX:+UseConcMarkSweepGC -XX:+CMSParallelRemarkEnabled -XX:+ParallelRefProcEnabled -XX:+CMSClassUnloadingEnabled -XX:MaxTenuringThreshold=15 -XX:SurvivorRatio=5 -XX:MaxDirectMemorySize=1g -Dlog4j.configuration=file:log4j.properties" \
--conf spark.yarn.submit.waitAppCompletion=false \
--conf spark.yarn.maxAppAttempts=4 \
--conf spark.yarn.am.attemptFailuresValidityInterval=1h \
--conf spark.yarn.max.executor.failures=16 \
--conf spark.yarn.executor.failuresValidityInterval=1h \
--conf spark.task.maxFailures=8 \
--files /zywa/job/sparkstreaming/config/log4j.properties \
/zywa/job/sparkstreaming/jars/weixieganzhi.jar 20190803
Reference: https://blog.csdn.net/cuixin20120511/article/details/80647854