[spark] Writing multiple columns to an HBase 1.3.1 table with bulk load from Spark 2.4.6

 

Contents

1. Create the HBase table

2. Test data file test2.txt

3. Writing to HBase 1.3.1 with bulk load from Spark 2.4.6


Advantages of bulk loading (compared with the ordinary Put-based write path sketched below):

  1. The write-ahead log (WAL) is bypassed, so bulk loading does not trigger flushes or splits
  2. Less garbage collection
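
For contrast, here is a minimal sketch of the ordinary Put-based write path that bulk loading avoids. It reuses the MyLoginCommon login helper from the listing in section 3; the class name PutWritePath and the sample row are illustrative assumptions.

import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

import security.MyLoginCommon;

public class PutWritePath {
    public static void main(String[] args) throws Exception {
        MyLoginCommon.loginHbase();   // same login helper used by the bulk-load job in section 3
        try (Connection conn = MyLoginCommon.getHbaseConn();
             Table table = conn.getTable(TableName.valueOf("default:t_test"))) {
            // Each Put is appended to the RegionServer's WAL and buffered in the MemStore,
            // which is the overhead (flushes, splits, GC pressure) that bulk loading skips.
            Put put = new Put(Bytes.toBytes("1595638951700"));
            put.addColumn(Bytes.toBytes("DATA"), Bytes.toBytes("i1"), Bytes.toBytes("1.1939971"));
            table.put(put);
        }
    }
}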

Processing flow:

  1. Import the external data into HDFS (a sketch of this step follows the test data in section 2)
  2. Use Spark to transform the data into the HFile format that HBase expects, and save it to HDFS
  3. Call the bulk-load API (LoadIncrementalHFiles) to load the HFiles into the target HBase table

1. Create the HBase table

The table lives in the default namespace, which already exists in HBase, so only the table itself needs to be created (the Spark code in section 3 refers to it as default:t_test):

create 'default:t_test', 'DATA'

2. Test data file test2.txt

1595638951700,1,1.1939971,1.4677016,1.4034922
1595638951721,1,1.3716854,1.566847,1.4458307
1595638951723,2,1.3352232,1.4566108,1.5208404
1595638951715,1,1.8877013,1.1247256,1.6103745
1595638951696,2,1.2885377,1.7600425,1.4150856
1595638951707,1,1.8486422,1.1446141,1.5813918
1595638951694,3,1.2366319,1.4496765,1.7620823
1595638951740,1,1.9078307,1.7746134,1.337183
1595638951714,3,1.261858,1.2809255,1.4845717
1595638951697,2,1.5660034,1.0154893,1.6899275
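
Before the Spark job can read it, test2.txt has to be copied into HDFS (step 1 of the processing flow above). A minimal sketch using the Hadoop FileSystem API is shown below; the local path /tmp/test2.txt and the class name UploadTestData are assumptions, and hdfs://hacluster is assumed to be the default file system. The same thing can be done from the command line with hdfs dfs -put test2.txt /user/test2.txt.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class UploadTestData {
    public static void main(String[] args) throws Exception {
        // Picks up core-site.xml / hdfs-site.xml from the classpath.
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Local source path is an assumption; the target matches the path read in section 3.
        fs.copyFromLocalFile(new Path("/tmp/test2.txt"),
                             new Path("hdfs://hacluster/user/test2.txt"));
        fs.close();
    }
}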

3. Writing to HBase 1.3.1 with bulk load from Spark 2.4.6

package mySpark;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import scala.Tuple2;

import security.MyLoginCommon;

public class MySpark3 implements Serializable {

    private static final Logger LOG = LoggerFactory.getLogger(MySpark3.class);

    private SparkSession sparkSession = null;

    public static void main(String[] args) throws Exception {
        String hfilePath = "hdfs://hacluster/user/test2.hfile";
        String sourceFileName = "hdfs://hacluster/user/test2.txt";

        Long start = System.currentTimeMillis();
        MySpark3 mySpark3 = new MySpark3();
        mySpark3.writeHbaseWithBulkLoadMultiCol(sourceFileName, hfilePath, "default", "t_test", "DATA");
        Long end = System.currentTimeMillis();
        System.out.println("Total time: [" + (end - start) / 1000 + "] seconds.");
    }

    public void writeHbaseWithBulkLoadMultiCol(String sourceFileName, String hfilePath,
            String hbaseNamespace, String hbaseTableName, String hbaseFamily) throws Exception {

        sparkSession = SparkSession.builder().appName("SparkBulkLoadToHBase").getOrCreate();
        JavaSparkContext javaSparkContext = new JavaSparkContext(sparkSession.sparkContext());

        LOG.info("Load file[" + sourceFileName + "] into Spark");
        JavaRDD<String> originRDD = javaSparkContext.textFile(sourceFileName);

        Configuration hbaseConf = MyLoginCommon.loginHbase();
        Connection hbaseConn = MyLoginCommon.getHbaseConn();

        final String tableName = hbaseNamespace + ":" + hbaseTableName;
        final String family = hbaseFamily;

        // Key each line by its first field (the row key) so the data can be sorted.
        LOG.info("Key the records by row key");
        JavaPairRDD<String, String> javaPairRDD = originRDD.mapToPair(new PairFunction<String, String, String>() {
            @Override
            public Tuple2<String, String> call(String s) throws Exception {
                String[] a = s.split(",");
                return new Tuple2<String, String>(a[0], s);
            }
        });

        // HFiles must be written in row-key order, so sort before generating KeyValues.
        LOG.info("Sort by row key");
        JavaPairRDD<String, String> javaPairRDDSort = javaPairRDD.sortByKey();
        JavaRDD<String> dataRDD = javaPairRDDSort.map(new Function<Tuple2<String, String>, String>() {
            @Override
            public String call(Tuple2<String, String> s) throws Exception {
                return s._2;
            }
        });

        // Turn every line into one KeyValue per data column: qualifiers i1, i2, ... in
        // ascending order, all under the same column family.
        LOG.info("Produce HFile-format records");
        JavaPairRDD<ImmutableBytesWritable, KeyValue> javaFlatPairRDD =
                dataRDD.flatMapToPair(new PairFlatMapFunction<String, ImmutableBytesWritable, KeyValue>() {
            @Override
            public Iterator<Tuple2<ImmutableBytesWritable, KeyValue>> call(String s) throws Exception {
                List<Tuple2<ImmutableBytesWritable, KeyValue>> list =
                        new ArrayList<Tuple2<ImmutableBytesWritable, KeyValue>>();
                String[] strArr = s.split(",");
                String rowkey = strArr[0];
                // Fields 2..n become columns i1..i(n-1); field 1 is not stored.
                for (int i = 2; i < strArr.length; i++) {
                    KeyValue keyValue = new KeyValue(Bytes.toBytes(rowkey), Bytes.toBytes(family),
                            Bytes.toBytes("i" + (i - 1)), Bytes.toBytes(strArr[i]));
                    ImmutableBytesWritable rk = new ImmutableBytesWritable(Bytes.toBytes(rowkey));
                    list.add(new Tuple2<ImmutableBytesWritable, KeyValue>(rk, keyValue));
                }
                return list.iterator();
            }
        });

        LOG.info("Save HFiles to HDFS");
        javaFlatPairRDD.saveAsNewAPIHadoopFile(hfilePath, ImmutableBytesWritable.class, KeyValue.class,
                HFileOutputFormat2.class, hbaseConf);

        LOG.info("Load HFiles into table[" + tableName + "]");
        LoadIncrementalHFiles load = new LoadIncrementalHFiles(hbaseConf);
        load.doBulkLoad(new Path(hfilePath),
                hbaseConn.getAdmin(),
                hbaseConn.getTable(TableName.valueOf(tableName)),
                hbaseConn.getRegionLocator(TableName.valueOf(tableName)));

        hbaseConn.close();
        javaSparkContext.close();
        sparkSession.close();
    }
}
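
Once the job reports success, the load can be spot-checked by reading one of the rows back. The sketch below is illustrative: it reuses the MyLoginCommon helper from the listing above, the class name VerifyBulkLoad is an assumption, and the row key is one of the timestamps from test2.txt.

import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

import security.MyLoginCommon;

public class VerifyBulkLoad {
    public static void main(String[] args) throws Exception {
        MyLoginCommon.loginHbase();   // same login helper used by the bulk-load job above
        try (Connection conn = MyLoginCommon.getHbaseConn();
             Table table = conn.getTable(TableName.valueOf("default:t_test"))) {
            // Row key taken from the sample data in test2.txt.
            Get get = new Get(Bytes.toBytes("1595638951694"));
            Result result = table.get(get);
            for (Cell cell : result.rawCells()) {
                System.out.println(Bytes.toString(CellUtil.cloneQualifier(cell)) + " = "
                        + Bytes.toString(CellUtil.cloneValue(cell)));
            }
        }
    }
}

If the bulk load worked, the columns DATA:i1, DATA:i2 and DATA:i3 of that row should come back with the values from the corresponding line of test2.txt.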

 

 
