Advantages:
- Writing a huge volume of data into HBase in one go through the normal write path is not only slow, it also consumes a lot of Region resources. A far more efficient and convenient approach is "Bulk Loading", i.e. the HFileOutputFormat class provided by HBase.
- It exploits the fact that HBase stores its data on HDFS in a specific file format: the job directly generates files in that HDFS storage format (HFiles) and then moves them to the right location, completing the bulk import of massive data. Done with MapReduce, it is efficient and convenient, and it neither occupies Region resources nor adds extra load to the cluster.
Limitations:
- It is only suitable for the initial data import, i.e. the table is empty, or the table contains no data before each import.
- The HBase cluster and the Hadoop cluster must be the same cluster, i.e. the HDFS that HBase runs on is the same cluster where the MapReduce job generates the HFiles.
Writing the code:
Create the target table in HBase in advance.
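For the table used below (a single column family named info), that means running the following in the HBase shell before submitting the job:
    create 'dianxin_bulk', 'info'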
Basic flow for generating the HFiles:
1. Set the Mapper's output K/V types
K: ImmutableBytesWritable, representing the row key
V: KeyValue, representing the cell value
2. Write the Mapper
Read the records from the source data and process them as required;
emit the rowkey as K and one or more KeyValues (or Puts) as V
3. Configure the job
the ZooKeeper connection address;
set the OutputFormat to HFileOutputFormat2 and configure its parameters
4. Submit the job
Flow for importing the HFiles into the RegionServers:
Obtain a Table object for the target table;
obtain a RegionLocator for locating its regions;
then use LoadIncrementalHFiles to perform the doBulkLoad operation.
package bolkloading;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.*;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;

/**
 * Mapper with the required output K/V types:
 * K: ImmutableBytesWritable (the row key), V: KeyValue (one cell).
 */
class BulkLoadingMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, KeyValue> {
    // We extend Mapper, so we override its map() method.
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Split each input line on the tab character (\t).
        String[] split = value.toString().split("\t");
        // Only process lines that actually contain all eight fields.
        if (split.length >= 8) {
            String phoneName = split[0];
            String wgId = split[1];
            String city = split[2];
            String qxId = split[3];
            String waitTime = split[4];
            String startTime = split[5];
            String endTime = split[6];
            String dataTime = split[7];
            // Row-key design: phone number and grid id joined by an underscore, so it can be split again later.
            String rowkey = phoneName + "_" + wgId;
            // Wrap each cell as a KeyValue (row key, column family, column qualifier, value).
            KeyValue wgId2 = new KeyValue(rowkey.getBytes(), "info".getBytes(), "wgId".getBytes(), wgId.getBytes());
            KeyValue city2 = new KeyValue(rowkey.getBytes(), "info".getBytes(), "city".getBytes(), city.getBytes());
            KeyValue quId2 = new KeyValue(rowkey.getBytes(), "info".getBytes(), "quId".getBytes(), qxId.getBytes());
            KeyValue waitTime2 = new KeyValue(rowkey.getBytes(), "info".getBytes(), "waitTime".getBytes(), waitTime.getBytes());
            KeyValue startTime2 = new KeyValue(rowkey.getBytes(), "info".getBytes(), "startTime".getBytes(), startTime.getBytes());
            KeyValue endTime2 = new KeyValue(rowkey.getBytes(), "info".getBytes(), "endTime".getBytes(), endTime.getBytes());
            KeyValue dataTime2 = new KeyValue(rowkey.getBytes(), "info".getBytes(), "dataTime".getBytes(), dataTime.getBytes());
            // The output key must be an ImmutableBytesWritable, so wrap the row key before writing.
            ImmutableBytesWritable rowkey2 = new ImmutableBytesWritable(rowkey.getBytes());
            context.write(rowkey2, wgId2);
            context.write(rowkey2, city2);
            context.write(rowkey2, quId2);
            context.write(rowkey2, waitTime2);
            context.write(rowkey2, startTime2);
            context.write(rowkey2, endTime2);
            context.write(rowkey2, dataTime2);
        }
    }
}

/**
 * Job configuration. rowkey2 is the row key and each KeyValue is one cell; the column family
 * must be "info" to match the HBase table, otherwise the cells cannot be loaded.
 */
public class BolkLoad {
    public static void main(String[] args) throws Exception {
        // 1. Create the configuration. Since the HBase and Hadoop clusters are the same cluster,
        //    an HBase configuration works for both and is convenient for writing HFiles.
        Configuration conf = HBaseConfiguration.create();
        // 2. Set the ZooKeeper quorum.
        conf.set("hbase.zookeeper.quorum", "master:2181,node1:2181,node2:2181");
        // 3. Create the job with this configuration.
        Job job = Job.getInstance(conf);
        // Job name.
        job.setJobName("BolkLoad dianxin_data");
        // Main class.
        job.setJarByClass(BolkLoad.class);
        // Note: setting the number of reduce tasks here has no effect; it is determined by the number of regions.
        job.setNumReduceTasks(4);
        // 4. Configure the map side.
        job.setMapperClass(BulkLoadingMapper.class);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(KeyValue.class);
        // 5. The map output is not globally ordered, so use a total-order partitioner
        //    (configureIncrementalLoad below will reconfigure this based on the table's regions).
        job.setPartitionerClass(SimpleTotalOrderPartitioner.class);
        // 6. Configure the reducer so that each partition is sorted.
        job.setReducerClass(KeyValueSortReducer.class);
        // 7. Input path: the dianxin_data file on HDFS.
        FileInputFormat.addInputPath(job, new Path("/data/DIANXIN/input/dianxin_data"));
        // 7.1 Output path for the generated HFiles.
        FileOutputFormat.setOutputPath(job, new Path("/data/DIANXIN/output/dianxin_data"));
        // Obtain the table: create a connection, get an Admin from it, get the Table by name
        // (TableName.valueOf), and get a RegionLocator for the regions' locations.
        Connection conn = ConnectionFactory.createConnection(conf);
        Admin admin = conn.getAdmin();
        Table dianxin_bulk = conn.getTable(TableName.valueOf("dianxin_bulk"));
        RegionLocator regionLocator = conn.getRegionLocator(TableName.valueOf("dianxin_bulk"));
        // Configure the job to write HFiles with HFileOutputFormat2, passing the job,
        // the target table, and the region locator.
        HFileOutputFormat2.configureIncrementalLoad(job, dianxin_bulk, regionLocator);
        System.out.println("=========== HFile job configured for HBase table: dianxin_bulk ===========");
        // 8. Run the MapReduce job; the return value tells us whether it succeeded.
        boolean b = job.waitForCompletion(true);
        if (b) {
            System.out.println("HFiles generated successfully; loading them into HBase");
            // 8.1 Load the HFiles into HBase.
            LoadIncrementalHFiles load = new LoadIncrementalHFiles(conf);
            // 8.2 doBulkLoad takes four arguments: the HFile output path, the Admin, the Table and the RegionLocator.
            load.doBulkLoad(new Path("/data/DIANXIN/output/dianxin_data"), admin, dianxin_bulk, regionLocator);
            System.out.println("HFiles loaded successfully");
        } else {
            System.out.println("HFile generation failed");
        }
        /*
         * Deployment steps:
         * 1. Upload the data file to HDFS: /data/DIANXIN/input/dianxin_data
         * 2. Create the table in HBase:  create 'dianxin_bulk', 'info'
         * 3. Package the project and upload the jar.
         * 4. Run it:  hadoop jar <jar-name>.jar <package>.BolkLoad
         */
    }
}
Packaging and deployment:
1. Select the project and run the Maven package goal;
2. If the HDFS directories used in the code do not exist yet, create them first;
in a Hadoop shell run: hadoop fs -mkdir -p /data/DIANXIN/input
then check the result in the Hadoop web UI at master:50070;
3. The packaged jar is under target/; run it with: hadoop jar <jar-name>.jar <package>.<class-name> (an example command is shown after this list);
4. In the HBase shell, run scan on the table to verify the data was loaded successfully.
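For this project the command would look roughly like the following; the jar file name is an assumption based on the hadoop-hbase module's artifactId/version and the jar-with-dependencies assembly, so use whatever name Maven actually puts under target/:
    hadoop jar hadoop-hbase-1.0-SNAPSHOT-jar-with-dependencies.jar bolkloading.BolkLoad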
Notes:
1. For the final output, whether it comes from the map or the reduce phase, the output key and value types must be <ImmutableBytesWritable, KeyValue> or <ImmutableBytesWritable, Put>.
2. In the final output, the value type is either KeyValue or Put, and the corresponding sorter reducer is KeyValueSortReducer or PutSortReducer, respectively (a Put-based variant is sketched after these notes).
3. In the MR example, HFileOutputFormat2.configureIncrementalLoad(job, dianxin_bulk, regionLocator) configures the job automatically. SimpleTotalOrderPartitioner first sorts the keys as a whole and then assigns them to the reducers, guaranteeing that the key ranges handled by different reducers never overlap; this matters because, when the data is loaded into HBase, the keys within a Region must be in strictly sorted order.
4. In the MR example, the generated HFiles are stored on HDFS, with one subdirectory per column family under the output path. Loading the HFiles into HBase is essentially a move of the HFiles into HBase's Regions (afterwards the column-family subdirectories are empty), but you cannot do the move yourself with a plain mv command, because that would not update HBase's metadata.
5. The HFiles are loaded into HBase via the doBulkLoad method of HBase's LoadIncrementalHFiles class, which imports the generated HFiles (a command-line alternative is shown below).
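To make note 2 concrete, here is a minimal sketch of a Put-based mapper. This is an illustrative assumption, not part of the original job; it reuses the same tab-separated input and the 'info' column family. With job.setMapOutputValueClass(Put.class) in the driver, configureIncrementalLoad should then wire in PutSortReducer instead of KeyValueSortReducer.

import java.io.IOException;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Emits <row key, Put> instead of <row key, KeyValue>.
class BulkLoadingPutMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] split = value.toString().split("\t");
        if (split.length >= 8) {
            String rowkey = split[0] + "_" + split[1];  // same row-key design as the main example
            Put put = new Put(rowkey.getBytes());
            put.addColumn("info".getBytes(), "wgId".getBytes(), split[1].getBytes());
            put.addColumn("info".getBytes(), "city".getBytes(), split[2].getBytes());
            // ...the remaining columns are added the same way
            context.write(new ImmutableBytesWritable(rowkey.getBytes()), put);
        }
    }
}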
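As an alternative to calling doBulkLoad from Java (note 5), the LoadIncrementalHFiles class can also be invoked as a command-line tool after the HFiles have been generated. A sketch using the paths from this example (the exact class name and invocation can vary between HBase versions, so treat this as an assumption to verify against your installation):
    hbase org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles /data/DIANXIN/output/dianxin_data dianxin_bulk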
Required dependencies: parent project pom
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.shujia</groupId>
    <artifactId>hadoop_bigdata</artifactId>
    <packaging>pom</packaging>
    <version>1.0-SNAPSHOT</version>

    <modules>
        <module>hadoop-hive</module>
        <module>hadoop-hbase</module>
    </modules>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <hadoop-version>2.7.6</hadoop-version>
        <junit-version>4.13.1</junit-version>
    </properties>

    <dependencyManagement>
        <dependencies>
            <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-common</artifactId>
                <version>${hadoop-version}</version>
            </dependency>
            <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-client</artifactId>
                <version>${hadoop-version}</version>
            </dependency>
            <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-hdfs</artifactId>
                <version>${hadoop-version}</version>
            </dependency>
            <dependency>
                <groupId>junit</groupId>
                <artifactId>junit</artifactId>
                <version>${junit-version}</version>
            </dependency>
            <!-- https://mvnrepository.com/artifact/com.janeluo/ikanalyzer -->
            <dependency>
                <groupId>com.janeluo</groupId>
                <artifactId>ikanalyzer</artifactId>
                <version>2012_u6</version>
            </dependency>
            <!-- https://mvnrepository.com/artifact/org.apache.hive/hive-jdbc -->
            <dependency>
                <groupId>org.apache.hive</groupId>
                <artifactId>hive-jdbc</artifactId>
                <version>1.2.1</version>
            </dependency>
            <dependency>
                <groupId>org.apache.hive</groupId>
                <artifactId>hive-exec</artifactId>
                <version>1.2.1</version>
            </dependency>
            <dependency>
                <groupId>org.apache.hbase</groupId>
                <artifactId>hbase-client</artifactId>
                <version>1.4.6</version>
            </dependency>
            <dependency>
                <groupId>org.apache.hbase</groupId>
                <artifactId>hbase-server</artifactId>
                <version>1.4.6</version>
            </dependency>
            <dependency>
                <groupId>mysql</groupId>
                <artifactId>mysql-connector-java</artifactId>
                <version>5.1.49</version>
            </dependency>
        </dependencies>
    </dependencyManagement>
</project>
hadoop-hbase child module pom:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <!--
    <parent>
        <artifactId>hadoop_bigdata</artifactId>
        <groupId>com.shujia</groupId>
        <version>1.0-SNAPSHOT</version>
    </parent>
    -->
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.shujia</groupId>
    <artifactId>hadoop-hbase</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <hadoop-version>2.7.6</hadoop-version>
        <junit-version>4.13.1</junit-version>
    </properties>

    <dependencies>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop-version}</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop-version}</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop-version}</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/com.janeluo/ikanalyzer -->
        <dependency>
            <groupId>com.janeluo</groupId>
            <artifactId>ikanalyzer</artifactId>
            <version>2012_u6</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hive/hive-jdbc -->
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-jdbc</artifactId>
            <version>1.2.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>1.2.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>1.4.6</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>1.4.6</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.49</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!-- compiler plugin: set the JDK version -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>2.3.2</version>
                <configuration>
                    <encoding>UTF-8</encoding>
                    <source>1.8</source>
                    <target>1.8</target>
                    <showWarnings>true</showWarnings>
                </configuration>
            </plugin>
            <!-- assembly plugin: build a jar with all dependencies included -->
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>