Advantages of bulk loading:
- It bypasses the write-ahead log (WAL), and the write itself triggers no MemStore flushes or region splits
- Less garbage collection, since the data is written directly as HFiles instead of passing through the RegionServer heap
Processing flow:
- Import the external data into HDFS
- Use Spark to transform the data into the HFile format HBase expects and save the result to HDFS
- Call the bulk load API (LoadIncrementalHFiles in the code below) to load the HFiles into the target HBase table
1. Create the HBase table
The table lives in HBase's built-in default namespace (which already exists, so no create_namespace call is needed) and has a single column family DATA:
create 'default:t_test', 'DATA'
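If you prefer to create the table from Java instead of the shell, a minimal sketch against the HBase 1.3 client API could look like the following. It reuses the article's MyLoginCommon helper for the Kerberos login and connection; the package and class name are just placeholders.

package myHbase;

import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import security.MyLoginCommon;

public class CreateTestTable {
    public static void main(String[] args) throws Exception {
        // Login and connection handling as in the bulk load job below.
        MyLoginCommon.loginHbase();
        Connection conn = MyLoginCommon.getHbaseConn();
        try (Admin admin = conn.getAdmin()) {
            TableName table = TableName.valueOf("default:t_test");
            if (!admin.tableExists(table)) {
                // One column family, DATA, matching the shell command above.
                HTableDescriptor desc = new HTableDescriptor(table);
                desc.addFamily(new HColumnDescriptor("DATA"));
                admin.createTable(desc);
            }
        }
    }
}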
2. Test data file test2.txt
Each line contains a timestamp (used as the rowkey below), a second field that the job skips, and three values that end up in columns i1, i2 and i3:
1595638951700,1,1.1939971,1.4677016,1.4034922
1595638951721,1,1.3716854,1.566847,1.4458307
1595638951723,2,1.3352232,1.4566108,1.5208404
1595638951715,1,1.8877013,1.1247256,1.6103745
1595638951696,2,1.2885377,1.7600425,1.4150856
1595638951707,1,1.8486422,1.1446141,1.5813918
1595638951694,3,1.2366319,1.4496765,1.7620823
1595638951740,1,1.9078307,1.7746134,1.337183
1595638951714,3,1.261858,1.2809255,1.4845717
1595638951697,2,1.5660034,1.0154893,1.6899275
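The Spark job below reads the source file from hdfs://hacluster/user/test2.txt, so test2.txt has to be uploaded to HDFS first (step 1 of the flow above). A plain hdfs dfs -put works; as a minimal Java sketch with the Hadoop FileSystem API (assuming the client configuration resolves the hacluster nameservice, and with an illustrative local path):

package myHDFS;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class UploadTestFile {
    public static void main(String[] args) throws Exception {
        // Picks up core-site.xml / hdfs-site.xml from the classpath, so fs.defaultFS
        // should already point at hdfs://hacluster.
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Copy the local test file to the path the Spark job reads from.
        fs.copyFromLocalFile(new Path("/tmp/test2.txt"), new Path("/user/test2.txt"));
        fs.close();
    }
}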
3. Writing to HBase 1.3.1 with bulk load from Spark 2.4.6
package mySpark;
import org.apache.spark.sql.SparkSession;
import myHDFS.MyHDFS;
import myHbase.MyHbase;
import security.MyLoginCommon;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.BufferedMutator;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;
import org.apache.hadoop.hbase.spark.JavaHBaseContext;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import scala.Tuple2;

public class MySpark3 implements Serializable {

    private static final Logger LOG = LoggerFactory.getLogger(MySpark3.class);

    private SparkSession sparkSession = null;
    public static void main(String[] args) throws Exception {
        String hfilePath = "hdfs://hacluster/user/test2.hfile";
        String sourceFileName = "hdfs://hacluster/user/test2.txt";
        long start = System.currentTimeMillis();
        MySpark3 mySpark3 = new MySpark3();
        mySpark3.writeHbaseWithBulkLoadMultiCol(sourceFileName, hfilePath, "default", "t_test", "DATA");
        long end = System.currentTimeMillis();
        System.out.println("Total time: [" + (end - start) / 1000 + "] seconds.");
    }
    public void writeHbaseWithBulkLoadMultiCol(String sourceFileName, String hfilePath,
            String hbaseNamespace, String hbaseTableName, String hbaseFamily) throws Exception {
        // Build the SparkSession here; the app name is only illustrative.
        sparkSession = SparkSession.builder().appName("SparkBulkLoadHbase").getOrCreate();
        JavaSparkContext javaSparkContext = new JavaSparkContext(sparkSession.sparkContext());
        LOG.info("Load file[" + sourceFileName + "] into Spark");
        JavaRDD<String> originRDD = javaSparkContext.textFile(sourceFileName);
        // Kerberos login and the HBase connection come from the project's own helper class.
        Configuration hbaseConf = MyLoginCommon.loginHbase();
        Connection hbaseConn = MyLoginCommon.getHbaseConn();
        // Created as in the original listing; not actually used by this bulk load path.
        JavaHBaseContext hbaseContext = new JavaHBaseContext(javaSparkContext, hbaseConf);
        String tableName = hbaseNamespace + ":" + hbaseTableName;
        LOG.info("Map each line to a (rowkey, line) pair");
        JavaPairRDD<String, String> javaPairRDD = originRDD.mapToPair(new PairFunction<String, String, String>() {
            @Override
            public Tuple2<String, String> call(String s) throws Exception {
                // The first field (the timestamp) becomes the rowkey.
                String[] a = s.split(",");
                return new Tuple2<String, String>(a[0], s);
            }
        });
        LOG.info("Sort by rowkey");
        // HFiles must be written in rowkey order, so sort before building the KeyValues.
        JavaPairRDD<String, String> javaPairRDDSort = javaPairRDD.sortByKey();
        JavaRDD<String> dataRDD = javaPairRDDSort.map(new Function<Tuple2<String, String>, String>() {
            @Override
            public String call(Tuple2<String, String> s) throws Exception {
                return s._2;
            }
        });
Log.info("product hfile formate file");
JavaPairRDD<ImmutableBytesWritable, KeyValue> javaFlatPairRDD= dataRDD.flatMapToPair(new PairFlatMapFunction<String,ImmutableBytesWritable,KeyValue>(){
@Override
public Iterator<Tuple2<ImmutableBytesWritable, KeyValue>> call(String s) throws Exception {
List<Tuple2<ImmutableBytesWritable, KeyValue>> list=new ArrayList<Tuple2<ImmutableBytesWritable, KeyValue>>();
String []strArr=s.split(",");
for(int i=2;i<strArr.length;i++){
String rowkey=strArr[0];
KeyValue keyValue = new KeyValue(Bytes.toBytes(rowkey), Bytes.toBytes("DATA"), Bytes.toBytes("i"+(i-1)), Bytes.toBytes(strArr[i]));
ImmutableBytesWritable rrk=new ImmutableBytesWritable(Bytes.toBytes(rowkey));
Tuple2<ImmutableBytesWritable, KeyValue> tuple=new Tuple2<ImmutableBytesWritable, KeyValue>(rrk,keyValue);
list.add(tuple);
}
return list.iterator();
}
});
Log.info("sava hfile to hdfs.");
javaFlatPairRDD.saveAsNewAPIHadoopFile(hfilePath, ImmutableBytesWritable.class, KeyValue.class, HFileOutputFormat2.class, hbaseConf);
Log.info("Load hfile to table["+tableName+"]");
LoadIncrementalHFiles load = new LoadIncrementalHFiles(hbaseConf);
load.doBulkLoad( new Path(hfilePath),
hbaseConn.getAdmin(),
hbaseConn.getTable(TableName.valueOf(tableName)),
hbaseConn.getRegionLocator(TableName.valueOf(tableName)));
javaSparkContext.close();
sparkSession.close();
}
}
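Once the job finishes, the result can be checked with scan 'default:t_test' in the HBase shell, or with a small read-side sketch like the one below (again reusing the article's MyLoginCommon helper; the class name is just a placeholder). The job itself is typically packaged into a jar and launched with spark-submit on a node that can reach both HDFS and HBase.

package myHbase;

import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;
import security.MyLoginCommon;

public class ScanTestTable {
    public static void main(String[] args) throws Exception {
        MyLoginCommon.loginHbase();
        Connection conn = MyLoginCommon.getHbaseConn();
        try (Table table = conn.getTable(TableName.valueOf("default:t_test"));
             ResultScanner scanner = table.getScanner(new Scan())) {
            for (Result result : scanner) {
                // Print the rowkey and the i1 column written by the bulk load.
                System.out.println(Bytes.toString(result.getRow()) + " -> "
                        + Bytes.toString(result.getValue(Bytes.toBytes("DATA"), Bytes.toBytes("i1"))));
            }
        }
    }
}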